test-pipeline.yaml 24.4 KB
Newer Older
Simon Mo's avatar
Simon Mo committed
1
2
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
3
4

# This script will be fed into the Jinja template in `test-template-aws.j2` at
5
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
6
7
# to generate the final pipeline yaml file.

8
9
10
11
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
12
# optional(bool): never run this test by default (i.e. it must be unblocked manually) unless it's a scheduled nightly run.
13
14
15
16
17
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
# mirror_hardwares(list): the list of additional hardware to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 and 4.
18
# num_nodes(int): simulate a multi-node setup by launching multiple containers on one host,
19
20
21
22
23
24
25
26
#     in this case, commands must be specified. the first command runs on first host, the second
#     command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.

# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add it to any existing step
27
28
# - If the test takes more than 10min, then it is okay to create a new step.
#   Note that all steps execute in parallel.
Simon Mo's avatar
Simon Mo committed
29
30

steps:
31
32
33
34
##### fast check tests  #####

- label: Documentation Build # 2min
  # Builds the Sphinx docs with warnings treated as errors (-W) and then
  # sanity-checks that the generated API reference page has content.
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: true
  commands:
  - pip install -r ../../requirements/docs.txt
  # NOTE(review): the backslash-escaped quotes are literal characters in a
  # YAML plain scalar; presumably they are needed because the template embeds
  # each command inside a quoted string -- confirm against test-template-aws.j2.
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html
42

43
- label: Async Engine, Inputs, Utils, Worker Test # 24min
44
45
  source_file_dependencies:
  - vllm/
46
  - tests/mq_llm_engine
47
48
49
50
51
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
52
  - tests/standalone_tests/lazy_imports.py
53
  commands:
54
  - python3 standalone_tests/lazy_imports.py
55
56
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
57
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
58
59
60
61
62
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

63
64
65
66
67
68
69
- label: Python-only Installation Test
  # Opt-in prefixes: the step is selected only when the install script
  # itself or setup.py changes.
  source_file_dependencies:
    - tests/standalone_tests/python_only_compile.sh
    - setup.py
  # No working_dir is set, so this runs from the default /vllm-workspace/tests.
  commands:
    - bash standalone_tests/python_only_compile.sh

70
- label: Basic Correctness Test # 30min
71
  #mirror_hardwares: [amd]
72
  fast_check: true
73
74
  source_file_dependencies:
  - vllm/
75
76
77
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_preemption
78
  - tests/basic_correctness/test_cumem.py
79
  commands:
80
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
81
  - pytest -v -s basic_correctness/test_cumem.py
82
  - pytest -v -s basic_correctness/test_basic_correctness.py
83
  - pytest -v -s basic_correctness/test_cpu_offload.py
84
85
86
87
88
89
90
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
91
92
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
93

94
- label: Core Test # 10min
95
  mirror_hardwares: [amd]
96
  fast_check: true
97
98
99
100
  source_file_dependencies:
  - vllm/core
  - vllm/distributed
  - tests/core
101
  commands:
102
  - pytest -v -s core
103

104
- label: Entrypoints Test # 40min
105
  working_dir: "/vllm-workspace/tests"
106
  fast_check: true
107
  #mirror_hardwares: [amd]
108
  source_file_dependencies:
109
  - vllm/
110
111
112
113
  - tests/entrypoints/llm
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
114
  commands:
115
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
116
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
117
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
118
119
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
120
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
121
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
122
  - pytest -v -s entrypoints/test_chat_utils.py
123
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
Simon Mo's avatar
Simon Mo committed
124

125
- label: Distributed Tests (4 GPUs) # 10min
  # 4-GPU distributed smoke tests: torchrun examples, data-parallel runs,
  # pynccl / utils unit tests, and the multi-GPU RLHF examples.
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  # The data-parallel example lives under the repository-level examples/
  # directory (the command below runs ../examples/... from tests/), so the
  # dependency prefix must not start with tests/.
  - examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - python3 rlhf.py
  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
156

157
- label: Metrics, Tracing Test # 10min
158
  mirror_hardwares: [amd]
159
  num_gpus: 2
160
161
162
163
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
164
  commands:
165
  - pytest -v -s metrics
166
  - pytest -v -s tracing
167

168
169
170
##### fast check tests  #####
#####  1 GPU test  #####

171
- label: Regression Test # 5min
172
  #mirror_hardwares: [amd]
173
174
175
  source_file_dependencies:
  - vllm/
  - tests/test_regression
176
177
178
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
179
180
181
182
183
184
185
186
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
187
188
189
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
190
  commands:
191
192
193
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization
Simon Mo's avatar
Simon Mo committed
194

195
196
197
198
199
200
- label: V1 Test
  #mirror_hardwares: [amd]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
201
    # split the test to avoid interference
202
203
    - pytest -v -s v1/core
    - pytest -v -s v1/engine
204
    - pytest -v -s v1/entrypoints
205
206
207
208
209
210
    - pytest -v -s v1/sample
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_stats.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
211
212
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
213
    - pytest -v -s v1/e2e
214
215
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
216
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
217

218
- label: Examples Test # 25min
219
  working_dir: "/vllm-workspace/examples"
220
  #mirror_hardwares: [amd]
221
222
223
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
224
  commands:
225
    - pip install tensorizer # for tensorizer test
226
227
228
    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 offline_inference/basic/chat.py
229
    - python3 offline_inference/prefix_caching.py
230
    - python3 offline_inference/llm_engine_example.py
231
232
233
234
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_embedding.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
235
    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
236
    - python3 offline_inference/encoder_decoder.py
237
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
238
239
240
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
241
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
242

243
- label: Prefix Caching Test # 9min
244
  mirror_hardwares: [amd]
245
246
247
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
248
  commands:
249
    - pytest -v -s prefix_caching
250

251
- label: Samplers Test # 36min
252
253
254
255
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
youkaichao's avatar
youkaichao committed
256
  - tests/conftest.py
257
258
259
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
Simon Mo's avatar
Simon Mo committed
260

261
- label: LogitsProcessor Test # 5min
262
  mirror_hardwares: [amd]
263
264
  source_file_dependencies:
  - vllm/model_executor/layers
265
  - vllm/model_executor/guided_decoding
266
  - tests/test_logits_processor
267
  - tests/model_executor/test_guided_processors
268
  commands:
269
270
    - pytest -v -s test_logits_processor.py
    - pytest -v -s model_executor/test_guided_processors.py
271

272
- label: Speculative decoding tests # 40min
273
274
275
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
276
  - vllm/model_executor/models/eagle.py
277
  commands:
278
    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
279
    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
280
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
281

282
- label: LoRA Test %N # 15min each
283
  #mirror_hardwares: [amd]
284
285
286
  source_file_dependencies:
  - vllm/lora
  - tests/lora
287
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
288
289
  parallelism: 4

290
291
292
293
294
295
296
297
- label: PyTorch Compilation Unit Tests
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py

298
- label: PyTorch Fullgraph Smoke Test # 9min
299
300
301
302
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
303
  - pytest -v -s compile/test_basic_correctness.py
304
305
306
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
307

308
- label: PyTorch Fullgraph Test # 18min
309
310
311
312
313
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
314

315
- label: Kernels Test %N # 1h each
316
  # mirror_hardwares: [amd]
317
318
319
320
321
322
323
324
325
  source_file_dependencies:
  - csrc/
  - vllm/attention
  - tests/kernels
  commands:
    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Tensorizer Test # 11min
326
  # mirror_hardwares: [amd]
327
328
329
330
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
331
  commands:
332
    - apt-get update && apt-get install -y curl libsodium23
333
334
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader
335

336
- label: Benchmarks # 9min
Simon Mo's avatar
Simon Mo committed
337
  working_dir: "/vllm-workspace/.buildkite"
338
  mirror_hardwares: [amd]
339
340
  source_file_dependencies:
  - benchmarks/
Simon Mo's avatar
Simon Mo committed
341
  commands:
342
  - bash scripts/run-benchmarks.sh
343

344
- label: Quantization Test # 33min
345
346
347
348
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
349
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
350
351

- label: LM Eval Small Models # 53min
352
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
353
354
355
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
356
357
358
359
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

360
361
362
363
364
365
366
367
- label: OpenAI API correctness
  # Selected when the kernels, the OpenAI entrypoints, or the Whisper model
  # implementation change.
  source_file_dependencies:
    - csrc/
    - vllm/entrypoints/openai/
    - vllm/model_executor/models/whisper.py
  # LMEval + transcription WER check
  commands:
    - pytest -s entrypoints/openai/correctness/

368
369
370
371
372
373
374
- label: Encoder Decoder tests # 5min
  # Runs the encoder_decoder test directory from the default working_dir
  # (/vllm-workspace/tests) on any change under vllm/ or the tests themselves.
  source_file_dependencies:
    - vllm/
    - tests/encoder_decoder
  commands:
    - pytest -v -s encoder_decoder

375
376
- label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
377
  #mirror_hardwares: [ amd ]
378
379
380
  source_file_dependencies:
    - vllm/
    - tests/tool_use
381
    - tests/mistral_tool_use
382
383
  commands:
    - pytest -v -s tool_use
384
    - pytest -v -s mistral_tool_use
385

386
387
#####  models test  #####

388
- label: Basic Models Test # 24min
389
390
391
392
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
393
    - pytest -v -s models/test_transformers.py
394
    - pytest -v -s models/test_registry.py
395
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
396
397
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
398

399
- label: Language Models Test (Standard) # 32min
400
401
402
403
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
404
405
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
406
  commands:
407
408
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/language -m core_model
409

410
- label: Language Models Test (Extended) # 1h10min
411
  optional: true
412
413
414
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
415
416
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
417
  commands:
418
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
419
    - pytest -v -s models/embedding/language -m 'not core_model'
420

421
- label: Multi-Modal Models Test (Standard) # 40min
  # Core/quantized subset of the multi-modal model tests, plus the generic
  # multimodal processing tests and the interleaved-input test.
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  # Every test directory invoked in `commands` is listed here so that a
  # test-only change to any of them still selects this step.
  - tests/models/multimodal
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/language
  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
    - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
440

441
- label: Multi-Modal Models Test (Extended) 1 # 48m
442
  optional: true
443
444
445
446
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
447
448
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
449
  commands:
450
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
451
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
452
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
453
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
454
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
455
456
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
457

458
459
460
461
462
463
464
465
466
- label: Multi-Modal Models Test (Extended) 2 # 38m
  # Optional step: runs only the `split(group=1)` portion of the
  # vision-language test_models.py, excluding core and quantized models.
  optional: true
  source_file_dependencies:
    - vllm/
    - tests/models/decoder_only/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

467
# This test is used only in PR development phase to test individual models and should never run on main
468
- label: Custom Models Test
469
  mirror_hardwares: [amd]
470
471
  optional: true
  commands:
472
    - echo 'Testing custom models...'
473
474
475
    # PR authors can temporarily add commands below to test individual models
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
476

477
478
479
480
#####  1 GPU test  #####
#####  multi gpus test  #####

- label: Distributed Comm Ops Test # 7min
481
  mirror_hardwares: [amd]
482
483
484
485
486
487
488
489
490
491
492
493
494
495
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  # With num_nodes set, `commands` is a list of per-node command lists:
  # the first inner list runs on the first host, the second on the second.
  # Each inner list must stay nested under its own `-` entry.
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
508

509
- label: Distributed Tests (2 GPUs) # 40min
  # 2-GPU distributed coverage: data-parallel async engine, collective RPC,
  # compile tests, same-node detection, and the `distributed(num_gpus=2)`
  # marked subsets of the correctness and model tests.
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  # Dependency prefixes are repository-relative (unlike the commands, which
  # run from working_dir), so test files need the tests/ prefix.
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
543

544
545
546
547
548
549
550
551
552
553
554
555
556
- label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # other tests continue here:
557
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
558
559
560
561
562
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process

563
- label: Multi-step Tests (4 GPUs) # 36min
564
565
566
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
567
568
569
570
571
572
573
574
575
576
  - vllm/model_executor/layers/sampler.py
  - vllm/sequence.py
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/multi_step_worker.py
  - vllm/worker/model_runner_base.py
  - vllm/worker/model_runner.py
  - vllm/worker/multi_step_model_runner.py
  - vllm/engine
  - tests/multi_step
577
  commands:
578
579
580
  # this test is quite flaky
  # TODO: investigate and fix.
  # - pytest -v -s multi_step/test_correctness_async_llm.py
581
  - pytest -v -s multi_step/test_correctness_llm.py
582

583
- label: Pipeline Parallelism Test # 45min
584
  working_dir: "/vllm-workspace/tests"
585
  num_gpus: 4
586
  source_file_dependencies:
587
588
589
590
591
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
592
  commands:
593
  - pytest -v -s distributed/test_pp_cudagraph.py
594
  - pytest -v -s distributed/test_pipeline_parallel.py
595

596
- label: LoRA TP Test (Distributed)
597
598
599
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
600
  - tests/lora
601
  commands:
602
603
604
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
605
    # There is some Tensor Parallelism related processing logic in LoRA that
606
607
608
609
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py

610

611
- label: Weight Loading Multiple GPU Test  # 33min
612
613
614
615
616
617
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
618
619
620
621
622
623
624
625
626
627
628
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
629
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
630
631


632
633
##### multi gpus test #####
##### A100 test #####
634

635
- label: Distributed Tests (A100) # optional
636
  gpu: a100
637
  optional: true
638
  num_gpus: 4
639
640
  source_file_dependencies:
  - vllm/
641
  commands:
642
643
644
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
645
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
646
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
647
  - pytest -v -s -x lora/test_mixtral.py
648
649
650

- label: LM Eval Large Models # optional
  gpu: a100
651
  optional: true
652
653
654
655
656
657
658
659
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4