# test-amd.yaml
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This file is fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. values used in this file: amdexperimental, amdproduction, amdtentative
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
#     in this case, commands must be specified. the first command runs on the first host, the second
#     command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.

# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
#   Note that all steps execute in parallel.

steps:
##### fast check tests  #####

# Runs the standalone nightly-torch dependency script; soft_fail lets the
# pipeline continue when it fails.
- label: Pytorch Nightly Dependency Override Check # 2min
  # If this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to the whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

# Runs the non-CPU multimodal tests (marker 'not cpu_test') and the utils_ tests.
- label: Async Engine, Inputs, Utils, Worker Test # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

# CPU-only counterpart (no_gpu: true): lazy-import check, input/output tests,
# the 'cpu_test'-marked multimodal tests, and tokenizer/tool-parser/config suites.
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config

# Runs the standalone python-only compile script.
- label: "Python-only Installation Test"  # 10min
  agent_pool: mi325_1
  mirror_hardwares:
    - amdexperimental
  timeout_in_minutes: 20
  # grade: Blocking
  source_file_dependencies:
    - tests/standalone_tests/python_only_compile.sh
    - setup.py
  commands:
    - bash standalone_tests/python_only_compile.sh

# Runs the cumem, basic-correctness and CPU-offload tests, one pytest
# invocation per file, with the worker multiprocessing method set to spawn.
- label: Basic Correctness Test # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py

# Runs the OpenAI tool-parser tests, then everything under entrypoints/ except
# the sub-suites listed in the --ignore flags.
- label: Entrypoints Unit Tests # 5min
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

# Runs the entrypoints/llm suite (test_generate.py in its own process,
# test_collective_rpc.py excluded) followed by the offline_mode tests.
- label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

# Runs entrypoints/openai minus the files and directories listed in the
# --ignore flags, then the chat-utils tests.
- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
  - pytest -v -s entrypoints/test_chat_utils.py

# Runs the sleep, tool_use and rpc entrypoint suites; the rpc run sets
# PYTHONPATH=/vllm-workspace for that command only.
- label: Entrypoints Integration Test (API Server 2)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/sleep
  - tests/entrypoints/rpc
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/sleep
  - pytest -v -s tool_use
  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc

# Runs the entrypoints/pooling suite.
- label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling

# Runs the entrypoints/openai/responses suite.
- label: Entrypoints Integration Test (Responses API)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai/responses
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/responses

# 4-GPU distributed step: torchrun examples in several TP/PP/DP/EP layouts,
# data-parallel tests, distributed unit tests and the RLHF examples.
- label: Distributed Tests (4 GPUs) # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  # Prefix covers test_torchrun_example.py and test_torchrun_example_moe.py,
  # both of which are executed below.
  - tests/distributed/test_torchrun_example
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  # The command below runs ../examples/offline_inference/data_parallel.py
  # (relative to working_dir), so track that path as well.
  - examples/offline_inference/data_parallel.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

# 8-GPU step: runs the torchrun data-parallel example with tp=2, dp=4 and
# expert parallelism enabled.
- label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

# Runs the EPLB algorithm test file.
- label: EPLB Algorithm Test # 5min
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

# Runs the EPLB execution and EPLB spec-decode test files on a 4-GPU agent.
- label: EPLB Execution Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  # Also executed below, so edits to it must trigger this step.
  - tests/distributed/test_eplb_spec_decode.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py

# Installs the OpenTelemetry packages, then runs the v1/tracing tests.
- label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  # Double-quoted scalar: each trailing backslash joins the next line, so this
  # is a single pip command.
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing

##### fast check tests  #####
#####  1 GPU test  #####

# Installs modelscope, then runs test_regression.py.
- label: Regression Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

# Runs the engine directory plus the sequence, config, logger and vllm_port
# test files in a single pytest invocation.
- label: Engine Test # 9min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

# Runs the v1/e2e and v1/engine suites.
- label: V1 Test e2e + engine # 65min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    - pytest -v -s v1/engine

# Runs the v1/entrypoints suite.
- label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints

# Runs the remaining v1 suites one directory or file per pytest invocation,
# then the lm-eval streaming accuracy test.
- label: V1 Test others # 42min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # split the test to avoid interference
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    - pytest -v -s v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
    - pytest -v -s v1/test_outputs.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

# TODO: Add the "V1 Test attention (MI300)" test group

# Runs the v1/attention suite.
- label: "V1 Test attention (H100)"  # 10min
  timeout_in_minutes: 30
  gpu: h100
  agent_pool: mi325_1
  mirror_hardwares:
    - amdexperimental
  # grade: Blocking
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - pytest -v -s v1/attention

# Installs pytest-timeout and pytest-forked, then runs the two batch-invariance
# test files under v1/determinism.
- label: Batch Invariance Tests (H100) # 10min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
    - tests/v1/determinism/
  commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

# Runs the v1/attention suite with gpu: b200.
# NOTE(review): unlike its neighbours, this step sets neither mirror_hardwares
# nor agent_pool - confirm that is intentional for the AMD pipeline.
- label: V1 Test attention (B200) # 10min
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
    - vllm/v1/attention
    - tests/v1/attention
  commands:
    - pytest -v -s v1/attention

# CPU-only (no_gpu: true) v1 tests: the 'cpu_test'-marked core, kv_connector
# and metrics tests, plus structured_output and serial utils.
- label: V1 Test others (CPU) # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/v1
  no_gpu: true
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'cpu_test' v1/metrics


# Runs the example scripts from /vllm-workspace/examples: basic offline
# inference, multi-modal, pooling, and feature demos.
- label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
    # for basic
    - python3 offline_inference/basic/chat.py
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
    - python3 pooling/pooling/vision_language_pooling.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
    #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048

# Runs cuda/test_cuda_context.py.
- label: "Platform Tests (CUDA)"  # 4min
  agent_pool: mi325_1
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  timeout_in_minutes: 15
  # grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py

# Runs the samplers suite, deselecting tests marked skip_v1.
- label: Samplers Test # 56min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
    - pytest -v -s -m 'not skip_v1' samplers

# Sharded LoRA suite: 4 parallel jobs, each given its shard id and shard count
# from the Buildkite parallel-job variables. The files listed in the --ignore
# flags are excluded from this step.
- label: LoRA Test %N # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
    - pytest -v -s lora \
      --shard-id=$$BUILDKITE_PARALLEL_JOB \
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
      --ignore=lora/test_llm_with_multi_loras.py \
      --ignore=lora/test_olmoe_tp.py \
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss_tp.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

# Runs each test_*.py directly under compile/ in its own pytest process.
- label: PyTorch Compilation Unit Tests # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

# Runs each test_*.py under compile/fullgraph/ except test_full_graph.py,
# one pytest process per file.
- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run smoke tests under fullgraph directory, except test_full_graph.py
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

# Runs test_full_graph.py (minus test_fp8_kv_scale_compile) and a filtered
# subset of the fusion e2e tests.
- label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

# Runs the cudagraph dispatch and cudagraph mode test files.
- label: Cudagraph test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

# Runs kernels/core together with kernels/test_top_k_per_row.py.
- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
    - pytest -v -s kernels/core kernels/test_top_k_per_row.py

# Sharded kernels/attention suite: 2 parallel jobs, shard id and count taken
# from the Buildkite parallel-job variables.
- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

# Sharded kernels/quantization suite split across 2 parallel jobs.
- label: "Kernels Quantization Test %N"  # 64min
  parallelism: 2
  timeout_in_minutes: 90
  agent_pool: mi325_8
  mirror_hardwares:
    - amdexperimental
  # grade: Blocking
  source_file_dependencies:
    - csrc/quantization/
    - vllm/model_executor/layers/quantization
    - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT

# Sharded kernels/moe suite: 2 parallel jobs, shard id and count taken from
# the Buildkite parallel-job variables.
- label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

# Runs the kernels/mamba suite.
- label: "Kernels Mamba Test"  # 31min
  agent_pool: mi325_1
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  timeout_in_minutes: 45
  # grade: Blocking
  source_file_dependencies:
    - csrc/mamba/
    - tests/kernels/mamba
    - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba

# Runs the DeepGEMM-related kernel tests: block-fp8 tests filtered with
# -k deep_gemm, plus the MoE and attention DeepGEMM test files.
- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
  # Not replicating for CUTLASS & CuTe
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py

# Installs helion, then runs the kernels/helion suite.
- label: Kernels Helion Test
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  source_file_dependencies:
  - vllm/utils/import_utils.py
  - tests/kernels/helion/
  commands:
    - pip install helion
    - pytest -v -s kernels/helion/

# Installs curl and libsodium23, then runs the model_executor suite and the
# tensorizer entrypoint test.
- label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

# Runs scripts/run-benchmarks.sh from /vllm-workspace/.buildkite.
- label: "Benchmarks"  # 11min
  working_dir: "/vllm-workspace/.buildkite"
  agent_pool: mi325_8
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  timeout_in_minutes: 20
  # grade: Blocking
  source_file_dependencies:
    - benchmarks/
  commands:
    - bash scripts/run-benchmarks.sh

# Runs the benchmarks/ test suite.
- label: "Benchmarks CLI Test"  # 7min
  agent_pool: mi325_8
  mirror_hardwares:
    - amdexperimental
    - amdproduction
  timeout_in_minutes: 20
  # grade: Blocking
  source_file_dependencies:
    - vllm/
    - tests/benchmarks/
  commands:
    - pytest -v -s benchmarks/

# Installs pinned torchao and conch-triton-kernels, then runs the quantization
# suite (test_blackwell_moe.py excluded) with VLLM_TEST_FORCE_LOAD_FORMAT=auto.
- label: Quantization Test # 70min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here

  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.14.1
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

# Runs the GSM8K correctness test against the configs listed in
# configs/models-small.txt.
- label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

# Installs torchcodec for ROCm (aborting the step if that fails), then runs
# the entrypoints/openai/correctness tests.
- label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  - tools/
  commands: # LMEval+Transcription WER check
  - bash ../tools/install_torchcodec_rocm.sh || exit 1
  - pytest -s entrypoints/openai/correctness/


#####  models test  #####

# Model initialization smoke step: runs only test_can_initialize_small_subset.
- label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/test_initialization.py"
  commands:
    # Exercise just the small subset of the model initialization tests
    - "pytest -v -s models/test_initialization.py::test_can_initialize_small_subset"

# Sharded model initialization step: runs every initialization test except the
# small subset, split across `parallelism` jobs via --num-shards/--shard-id.
- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
    # subset of supported models (the complement of the small subset in the above
    # test.) Also run if model initialization test file is modified
    - pytest -v -s models/test_initialization.py \
             -k 'not test_can_initialize_small_subset' \
             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
             --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

# Runs the transformers-backend and model-registry test files in one pytest call.
- label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/test_transformers.py"
  - "tests/models/test_registry.py"
  commands:
    - "pytest -v -s models/test_transformers.py models/test_registry.py"

# Model utility and vision helper tests; the step sets no_gpu: true.
- label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  no_gpu: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/test_utils.py"
  - "tests/models/test_vision.py"
  commands:
    - "pytest -v -s models/test_utils.py models/test_vision.py"

# Standard language-model step: selects tests marked core_model and not slow_test.
- label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/language"
  commands:
    # Print the installed torch-related packages before running the tests
    - "pip freeze | grep -E 'torch'"
    # Standard language models only; the slow subset is excluded by marker
    - "pytest -v -s models/language -m 'core_model and (not slow_test)'"

# Sharded run of the language-model tests marked both core_model and slow_test.
- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
    # Shard slow subset of standard language models tests. Only run when model
    # source is modified, or when specified test files are modified
    - pip freeze | grep -E 'torch'
    - export TORCH_NCCL_BLOCKING_WAIT=1
    - pytest -v -s models/language -m 'core_model and slow_test' \
             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
             --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

# Sharded run of the generation tests marked hybrid_model, after installing the
# mamba and causal-conv1d packages from git.
- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    # Shard hybrid language model tests
    - pytest -v -s models/language/generation \
                   -m hybrid_model \
                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
                   --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

# Optional step: generation tests that are neither core_model nor hybrid_model.
- label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
    - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

# Optional step running the models/language/generation_ppl_test suite.
- label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/language/generation_ppl_test"
  commands:
    - "pytest -v -s models/language/generation_ppl_test"

# Optional step: pooling tests that are not marked core_model.
- label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - "vllm/"
  - "tests/models/language/pooling"
  commands:
    - "pytest -v -s models/language/pooling -m 'not core_model'"

# Optional step running the models/language/pooling_mteb_test suite.
- label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
    - pytest -v -s models/language/pooling_mteb_test

# Multimodal processing tests with no_gpu: true; test_tensor_schema.py is ignored here.
- label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  no_gpu: true
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

# Full multimodal processing test directory, after installing Mantis from git.
- label: Multi-Modal Processor Test # 44min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - "vllm/"
  - "tests/models/multimodal"
  commands:
    - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git"
    - "pytest -v -s models/multimodal/processing"

# Core multimodal model tests. The prithvi_mae and whisper tests are excluded from
# the main pytest call and run as their own commands afterwards.
- label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 100
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - export MIOPEN_DEBUG_CONV_DIRECT=0
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
    - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

# lm-eval correctness run over configs/models-mm-small.txt, executed from the
# lm-eval-harness working directory.
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - export MIOPEN_DEBUG_CONV_DIRECT=0
  - export MIOPEN_DEBUG_CONV_GEMM=0
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt

# Optional step: non-core multimodal tests, excluding generation/test_common.py
# and the processing directory.
- label: Multi-Modal Models Test (Extended) 1 # 60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - export MIOPEN_DEBUG_CONV_DIRECT=0
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

# Optional step: non-core tests in generation/test_common.py selected by split(group=0).
- label: Multi-Modal Models Test (Extended) 2 #60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - export MIOPEN_DEBUG_CONV_DIRECT=0
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

# Optional step: non-core tests in generation/test_common.py selected by split(group=1).
- label: Multi-Modal Models Test (Extended) 3 # 75min
  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - export MIOPEN_DEBUG_CONV_DIRECT=0
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

# Runs the models/quantization pytest directory.
- label: Quantized Models Test # 45 min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
  optional: true
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  commands:
    # Placeholder command so the step has something to run when no custom
    # commands have been added.
    - "echo 'Testing custom models...'"
    # PR authors can temporarily add commands below to test individual models
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

# Optional step: upgrades transformers to the current git HEAD, then runs a set of
# model tests and offline-inference examples from the repository root.
- label: Transformers Nightly Models Test
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
    - pytest -v -s tests/models/test_transformers.py
    # - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
    - python3 examples/offline_inference/basic/chat.py
    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

# Kernel-level attention, quantization and MoE tests targeted at gpu: b200.
- label: Blackwell Test # 21 min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/v1/attention/selector.py
  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

# Compilation and fusion tests targeted at gpu: b200, including a restricted
# selection of the e2e fusion tests.
- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

# Optional 2-GPU step running the complete test_fusions_e2e.py file on gpu: b200.
- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
    - nvidia-smi
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

# Optional GPQA correctness evaluation of openai/gpt-oss-20b (--metric 0.58) on gpu: b200.
- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

# Runs tests/quantization/test_blackwell_moe.py on gpu: b200.
- label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - pytest -s -v tests/quantization/test_blackwell_moe.py

# Optional GSM8K correctness evaluation over configs/models-blackwell.txt on gpu: b200.
- label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

#####  1 GPU test  #####
#####  multi gpus test  #####

# 2-GPU step covering the communication-op and shared-memory (shm_*) distributed tests.
- label: Distributed Comm Ops Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - "vllm/distributed"
  - "tests/distributed"
  commands:
  - "pytest -v -s distributed/test_comm_ops.py"
  - "pytest -v -s distributed/test_shm_broadcast.py"
  - "pytest -v -s distributed/test_shm_buffer.py"
  - "pytest -v -s distributed/test_shm_storage.py"

# Simulated two-node step (num_nodes: 2, num_gpus: 2 per node). `commands` holds
# one nested command list per node: the first list runs on the first node and the
# second list on the second node.
- label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code

# 2-GPU distributed step: data-parallel, multi-API-server, compile, same-node,
# sequence-parallel, shutdown and worker memory snapshot tests.
- label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

# 2-GPU model step: runs the tests marked distributed(num_gpus=2) across basic
# correctness and the model suites, plus the sharded state loader test.
- label: Distributed Model Tests (2 GPUs) # 37min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - "vllm/model_executor/model_loader/sharded_state_loader.py"
  - "vllm/model_executor/models/"
  - "tests/basic_correctness/"
  - "tests/model_executor/model_loader/test_sharded_state_loader.py"
  - "tests/models/"
  commands:
  - "TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'"
  - "CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py"
  # Avoid importing model tests that cause CUDA reinitialization error
  - "pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'"
  - "pytest models/language -v -s -m 'distributed(num_gpus=2)'"
  - "pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py"
  # Whisper is ignored above and run on its own with the spawn multiprocessing method
  - "VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'"

# Plugin step: each plugin group installs its plugin, runs its tests, then
# uninstalls it before the next group starts.
- label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

# 4-GPU step running the pipeline-parallel CUDA graph and pipeline-parallel tests.
- label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - "vllm/distributed/"
  - "vllm/engine/"
  - "vllm/executor/"
  - "vllm/model_executor/models/"
  - "tests/distributed/"
  commands:
  - "pytest -v -s distributed/test_pp_cudagraph.py"
  - "pytest -v -s distributed/test_pipeline_parallel.py"

# 4-GPU LoRA tensor-parallel step; every pytest call uses -x to stop at the first failure.
- label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
    # FIXME: find out which code initializes CUDA before running the test;
    # until that is fixed, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py

    # Disabled for now because MXFP4 backend on non-cuda platform
    # doesn't support LoRA yet
    #- pytest -v -s -x lora/test_gptoss_tp.py

# Optional 2-GPU step: runs the weight-loading script against weight_loading/models-amd.txt.
- label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

# Optional 2-GPU step: runs the weight-loading script against weight_loading/models-large-amd.txt.
- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

# 4-GPU step: installs the ROCm KV-connector requirements, then runs the NIXL
# config-sweep accuracy script with the ROCM_ATTN attention backend.
- label: NixlConnector PD accuracy tests (Distributed) # 30min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh --attention-backend ROCM_ATTN

# 4-GPU step: same NIXL config-sweep accuracy script, run with DP_EP=1 set.
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh --attention-backend ROCM_ATTN

##### multi gpus test #####
##### A100 test #####

# Optional 4-GPU step: custom all-reduce, CA buffer sharing, 2-GPU basic
# correctness (TARGET_TEST_SUITE=A100) and the Mixtral LoRA test.
- label: Distributed Tests (A100) # optional
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  # Hardware selection.
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  num_gpus: 4
  # Scheduling: optional steps do not run by default.
  optional: true
  # Triggered by changes to the kernels or the quantization layers.
  source_file_dependencies:
    - csrc/
    - vllm/model_executor/layers/quantization
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
    # Worker processes are started with the "spawn" method for this run.
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # --tp-size matches num_gpus above.
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H100 test #####
- label: LM Eval Large Models (H100) # optional
  # Hardware selection.
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  num_gpus: 4
  # Scheduling: optional steps do not run by default.
  optional: true
  # Triggered by changes to the kernels or the quantization layers.
  source_file_dependencies:
    - csrc/
    - vllm/model_executor/layers/quantization
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
    # We found Triton is faster than DeepGEMM for H100
    - export VLLM_USE_DEEP_GEMM=0
    # --tp-size matches num_gpus above; uses the Hopper-specific config list.
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4


##### H200 test #####
- label: Distributed Tests (H200) # optional
  # Optional 2-GPU suite: compile/distributed fusion tests, sequence/context
  # parallel tests, a data-parallel offline-inference example, and DBO tests.
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
    # Quoted because the command embeds single quotes for the -k expression.
    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    # Pins the example to devices 0 and 1 (dp=2, tp=1).
    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
    - pytest -v -s tests/v1/distributed/test_dbo.py

##### B200 test #####
- label: Distributed Tests (B200) # optional
  # Optional 2-GPU suite: context parallel, NCCL symmetric-memory all-reduce,
  # and DBO tests, run from the repository root.
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
    - pytest -v -s tests/v1/distributed/test_dbo.py

##### E2E Eval Tests #####
- label: LM Eval Small Models (1 Card) # 15min
  # GSM8K correctness check over the small-model config list; triggered by
  # changes to the kernels or the quantization layers.
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

- label: LM Eval Large Models (4 Card)
  # Optional lm-eval correctness run over the large-model config list with
  # tensor parallel size 4 (matches num_gpus).
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: ROCm LM Eval Large Models (8 Card)
  # Hardware selection: production AMD mirror only, 8-GPU pool.
  mirror_hardwares:
    - amdproduction
  num_gpus: 8
  agent_pool: mi325_8
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  commands:
    # Worker processes are started with the "spawn" method for this run.
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # --tp-size matches num_gpus above; uses the ROCm-specific config list.
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8

- label: ROCm GPT-OSS Eval
  # GPQA correctness check for openai/gpt-oss-20b, run from the repo root.
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  agent_pool: mi325_1
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true  # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    # The eval extras are pinned to an exact version.
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    # --metric 0.58 is passed to the test as-is; presumably the minimum
    # accepted score — confirm against test_gpqa_correctness.py.
    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
  # Optional 2-GPU step that delegates entirely to run-prime-rl-test.sh;
  # triggered by changes under vllm/ or to the script itself.
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh

##### EPLB Accuracy Tests #####
- label: DeepSeek V2-Lite Accuracy
  # Optional 4-GPU scheduled integration test driven by a single script.
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  # NOTE(review): positional args are presumably accuracy threshold, number
  # of questions, and server port — confirm against the script.
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
  # Optional 4-GPU scheduled integration test driven by a single script.
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  # NOTE(review): positional args are presumably accuracy threshold, number
  # of questions, and server port — confirm against the script.
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  # Optional 2-GPU variant of the Qwen3-30B-A3B FP8-block accuracy test.
  timeout_in_minutes: 60
  gpu: b200
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  # NOTE(review): same script as the H100 step with two extra trailing
  # arguments (2 1); their meaning is defined by the script — confirm there.
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
  # Optional 4-GPU scheduled integration test driven by a single script.
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  # NOTE(review): positional args are presumably accuracy threshold, number
  # of questions, and server port — confirm against the script.
  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040