Commit bd363067 authored by lizhigong's avatar lizhigong
Browse files

Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

parents 87ef4618 d36deb1a
...@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`. ...@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import pytest import pytest
import os import os
from ..utils import models_path_prefix
from transformers import AutoModelForSeq2SeqLM from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ..utils import models_path_prefix
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data( ...@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make # correctly. As such, we just need to check one extra modality to make
# sure things pass through properly. # sure things pass through properly.
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
model = "Qwen/Qwen2-Audio-7B-Instruct" model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>" audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts = [ prompts = [
f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501 f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501
......
...@@ -19,7 +19,7 @@ from ...utils import models_path_prefix ...@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{ {
# Speculative max model len > overridden max model len should raise. # Speculative max model len > overridden max model len should raise.
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 129, "max_model_len": 129,
}, },
...@@ -29,7 +29,7 @@ from ...utils import models_path_prefix ...@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise. # Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12 # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 2048 + 1, "max_model_len": 2048 + 1,
}, },
...@@ -38,7 +38,7 @@ from ...utils import models_path_prefix ...@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise. # Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18 # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 131072 + 1, "max_model_len": 131072 + 1,
}, },
......
...@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "meta-llama/Llama-2-7b-chat-hf", "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
...@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct", "model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
...@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "Qwen/Qwen2-7B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
......
...@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization # Explicitly specify draft model quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": "gptq", "quantization": "gptq",
}, },
...@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization # Explicitly specify GPTQ-based draft model to use marlin quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": "marlin", "quantization": "marlin",
}, },
...@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization # Not explicitly specify draft model quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": None, "quantization": None,
}, },
...@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, ...@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_mqa_scorer": True, "disable_mqa_scorer": True,
}, },
......
...@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the ...@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctess for the target model outputs. correctess for the target model outputs.
""" """
import os
import pytest import pytest
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
MAIN_MODEL = "luccafong/deepseek_mtp_main_random" MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")
# max. number of speculative tokens: this corresponds to # max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model. # num_nextn_predict_layers in the config.json of the speculator model.
......
...@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
......
...@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output(): ...@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert (num_mismatch > 0) assert (num_mismatch > 0)
@torch.inference_mode() # @torch.inference_mode()
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) # @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between # # The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test # # the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths. # # both code paths.
@pytest.mark.parametrize('attn_backend', # @pytest.mark.parametrize('attn_backend',
[_Backend.XFORMERS, _Backend.FLASH_ATTN]) # [_Backend.XFORMERS, _Backend.FLASH_ATTN])
def test_multi_step_correct_kvcache(num_steps, attn_backend): # def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model # """Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token. # is correctly updated for sequences with bonus token.
""" # """
seed = 100 # seed = 100
model_name = "JackFram/llama-68m" # model_name = "JackFram/llama-68m"
block_size = 16 # block_size = 16
num_gpu_blocks = 2048 // block_size # num_gpu_blocks = 2048 // block_size
batch_size = 1 # batch_size = 1
with global_force_attn_backend_context_manager(attn_backend): # with global_force_attn_backend_context_manager(attn_backend):
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' # dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
multi_step_worker = create_worker(MultiStepWorker, # multi_step_worker = create_worker(MultiStepWorker,
model_name, # model_name,
block_size, # block_size,
num_gpu_blocks, # num_gpu_blocks,
seed, # seed,
model_runner_cls=TP1DraftModelRunner, # model_runner_cls=TP1DraftModelRunner,
dtype=dtype) # dtype=dtype)
multi_step_worker.set_include_gpu_probs_tensor() # multi_step_worker.set_include_gpu_probs_tensor()
worker = create_worker(Worker, # worker = create_worker(Worker,
model_name, # model_name,
block_size, # block_size,
num_gpu_blocks, # num_gpu_blocks,
seed, # seed,
dtype=dtype) # dtype=dtype)
prompts = [[0] for _ in range(batch_size)] # prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence # # Already generate two tokens for the sequence
# so that we can simulate the bonus token case # # so that we can simulate the bonus token case
multi_step_continuations = [[ # multi_step_continuations = [[
random.randint(0, 1000), # random.randint(0, 1000),
random.randint(0, 1000) # random.randint(0, 1000)
] for _ in prompts] # ] for _ in prompts]
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] # final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) # seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=multi_step_continuations, # continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
# Run multi-step. # # Run multi-step.
zero_kv_cache(multi_step_worker.cache_engine) # zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( # multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list), # seq_group_metadata_list=seq_group_metadata_list),
sample_len=num_steps, # sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step= # seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step) # seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly. # # Run single-step repeatedly.
zero_kv_cache(worker.cache_engine) # zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first # # Generate the kv cache for the bonus token first
single_step_continuations = [c[:1] for c in multi_step_continuations] # single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=single_step_continuations, # continuations=single_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model( # single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest( # execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list)) # seq_group_metadata_list=seq_group_metadata_list))
for _ in range(num_steps): # for _ in range(num_steps):
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=multi_step_continuations, # continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model( # single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest( # execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list)) # seq_group_metadata_list=seq_group_metadata_list))
for i, seq_group_output in enumerate(single_step_output[-1]): # for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations[i].append( # multi_step_continuations[i].append(
seq_group_output.samples[0].output_token) # seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and # # Verify that the KV cache of the single-step and
# multi-step workers are the same. # # multi-step workers are the same.
single_step_gpu_cache = worker.cache_engine[0].gpu_cache # single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache # multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers = len(single_step_gpu_cache) # num_layers = len(single_step_gpu_cache)
allclose = lambda a, b: torch.allclose( # allclose = lambda a, b: torch.allclose(
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) # a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
for i in range(num_layers): # for i in range(num_layers):
assert allclose(single_step_gpu_cache[i][0], # assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache[i][0]) # multi_step_gpu_cache[i][0])
assert allclose(single_step_gpu_cache[i][1], # assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache[i][1]) # multi_step_gpu_cache[i][1])
@torch.inference_mode() @torch.inference_mode()
......
...@@ -5,6 +5,7 @@ from collections import defaultdict ...@@ -5,6 +5,7 @@ from collections import defaultdict
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import MagicMock from unittest.mock import MagicMock
import os
import pytest import pytest
import torch import torch
...@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker ...@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from .test_utils import mock_spec_decode_sampler from .test_utils import mock_spec_decode_sampler
from .utils import (create_batch, create_sampler_output_list, create_worker, from .utils import (create_batch, create_sampler_output_list, create_worker,
mock_worker) mock_worker)
from ..utils import models_path_prefix
@pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('k', [1, 2, 6])
...@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle(): ...@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks = 8096 // block_size num_gpu_blocks = 8096 // block_size
target_worker = create_worker( target_worker = create_worker(
Worker, Worker,
"JackFram/llama-68m", os.path.join(models_path_prefix, "JackFram/llama-68m"),
block_size, block_size,
num_gpu_blocks, num_gpu_blocks,
seed, seed,
) )
draft_worker = create_worker( draft_worker = create_worker(
MultiStepWorker, MultiStepWorker,
"abhigoyal/vllm-eagle-llama-68m-random", os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
block_size, block_size,
num_gpu_blocks, num_gpu_blocks,
seed, seed,
......
...@@ -7,6 +7,7 @@ import pathlib ...@@ -7,6 +7,7 @@ import pathlib
import subprocess import subprocess
from functools import partial from functools import partial
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai import openai
import pytest import pytest
...@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download ...@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
# yapf: disable # yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
...@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): ...@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py", EXAMPLES_PATH / "offline_inference/multilora_inference.py",
) )
model_ref = "meta-llama/Llama-2-7b-hf" model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test") lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path) test_prompts = multilora_inference.create_test_prompts(lora_path)
......
...@@ -142,7 +142,7 @@ def test_get_sliding_window(): ...@@ -142,7 +142,7 @@ def test_get_sliding_window():
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config(): def test_get_pooling_config():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="auto", task="auto",
...@@ -164,7 +164,7 @@ def test_get_pooling_config(): ...@@ -164,7 +164,7 @@ def test_get_pooling_config():
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config_from_args(): def test_get_pooling_config_from_args():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
model_config = ModelConfig(model_id, model_config = ModelConfig(model_id,
task="auto", task="auto",
tokenizer=model_id, tokenizer=model_id,
...@@ -273,10 +273,10 @@ def test_rope_customization(): ...@@ -273,10 +273,10 @@ def test_rope_customization():
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Encoder Decoder models not supported on ROCm.") reason="Encoder Decoder models not supported on ROCm.")
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False), (os.path.join(models_path_prefix, "facebook/opt-125m"), False),
("facebook/bart-base", True), (os.path.join(models_path_prefix, "facebook/bart-base"), True),
("meta-llama/Llama-3.2-1B-Instruct", False), (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), False),
("meta-llama/Llama-3.2-11B-Vision", True), (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision"), True),
]) ])
def test_is_encoder_decoder(model_id, is_encoder_decoder): def test_is_encoder_decoder(model_id, is_encoder_decoder):
config = ModelConfig( config = ModelConfig(
...@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder): ...@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
@pytest.mark.parametrize(("model_id", "uses_mrope"), [ @pytest.mark.parametrize(("model_id", "uses_mrope"), [
("facebook/opt-125m", False), (os.path.join(models_path_prefix, "facebook/opt-125m"), False),
("Qwen/Qwen2-VL-2B-Instruct", True), (os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), True),
]) ])
def test_uses_mrope(model_id, uses_mrope): def test_uses_mrope(model_id, uses_mrope):
config = ModelConfig( config = ModelConfig(
...@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope): ...@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
def test_generation_config_loading(): def test_generation_config_loading():
model_id = "Qwen/Qwen2.5-1.5B-Instruct" model_id = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
# When set generation_config to "vllm", the default generation config # When set generation_config to "vllm", the default generation config
# will not be loaded. # will not be loaded.
......
...@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they ...@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
will never happen again. will never happen again.
""" """
import os
import gc import gc
import pytest import pytest
...@@ -12,8 +13,9 @@ import torch ...@@ -12,8 +13,9 @@ import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from utils import models_path_prefix from .utils import models_path_prefix
import os from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
...@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group(): ...@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256)
llm = LLM(model="distilbert/distilgpt2", llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000] prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
...@@ -36,7 +38,13 @@ def test_max_tokens_none(): ...@@ -36,7 +38,13 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=None) max_tokens=None)
llm = LLM(model="distilbert/distilgpt2", if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1,
block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["Just say hello!"] prompts = ["Just say hello!"]
...@@ -46,7 +54,7 @@ def test_max_tokens_none(): ...@@ -46,7 +54,7 @@ def test_max_tokens_none():
def test_gc(): def test_gc():
llm = LLM(model="distilbert/distilgpt2", enforce_eager=True) llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), enforce_eager=True)
del llm del llm
gc.collect() gc.collect()
...@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): ...@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat")) llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
prompts = [ prompts = [
......
...@@ -2,13 +2,15 @@ ...@@ -2,13 +2,15 @@
"""Tests for the SamplingParams class. """Tests for the SamplingParams class.
""" """
import os
import pytest import pytest
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from utils import models_path_prefix
MODEL_NAME = "Qwen/Qwen1.5-7B" MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B")
def test_max_tokens_none(): def test_max_tokens_none():
......
...@@ -8,6 +8,7 @@ import socket ...@@ -8,6 +8,7 @@ import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
import os
import pytest import pytest
import torch import torch
from vllm_test_utils.monitor import monitor from vllm_test_utils.monitor import monitor
......
...@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama": { "llama": {
"model": "model":
"meta-llama/Meta-Llama-3.1-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
...@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama3.2": { "llama3.2": {
"model": "model":
"meta-llama/Llama-3.2-3B-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
...@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama4": { "llama4": {
"model": "model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template", "--tool-call-parser", "pythonic", "--chat-template",
...@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama4_json": { "llama4_json": {
"model": "model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "-tp", "4", "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
"--distributed-executor-backend", "mp", "--tool-call-parser", "--distributed-executor-backend", "mp", "--tool-call-parser",
...@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# }, # },
"granite-3.0-8b": { "granite-3.0-8b": {
"model": "model":
"ibm-granite/granite-3.0-8b-instruct", os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template", "--tool-call-parser", "granite", "--chat-template",
...@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"granite-3.1-8b": { "granite-3.1-8b": {
"model": "model":
"ibm-granite/granite-3.1-8b-instruct", os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--enforce-eager",
"--no-enable-prefix-caching", "--no-enable-prefix-caching",
...@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"internlm": { "internlm": {
"model": "model":
"internlm/internlm2_5-7b-chat", os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template", "--tool-call-parser", "internlm", "--chat-template",
...@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"toolACE": { "toolACE": {
"model": "model":
"Team-ACE/ToolACE-8B", os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template", "--tool-call-parser", "pythonic", "--chat-template",
......
...@@ -4,6 +4,8 @@ from dataclasses import dataclass ...@@ -4,6 +4,8 @@ from dataclasses import dataclass
import lm_eval import lm_eval
import pytest import pytest
import os
from ..utils import models_path_prefix
TASK = "gsm8k" TASK = "gsm8k"
FILTER = "exact_match,strict-match" FILTER = "exact_match,strict-match"
...@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig: ...@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs. # NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [ ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig( GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
excepted_value=0.76), # no bias excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU, # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As # so only one of these tests can run in a single call to pytest. As
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
...@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, ...@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor) KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request from vllm.v1.request import Request
from ...utils import models_path_prefix
# yapf: enable # yapf: enable
...@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs(): ...@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "max_model_len", "want_estimated_max_len"), [ ("model_id", "max_model_len", "want_estimated_max_len"), [
("Qwen/Qwen1.5-7B", 16385, 16384), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
("Qwen/Qwen1.5-7B", 16383, 16383), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
]) ])
def test_estimate_max_model_len(model_id, max_model_len, def test_estimate_max_model_len(model_id, max_model_len,
want_estimated_max_len): want_estimated_max_len):
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from typing import Optional from typing import Optional
from unittest.mock import Mock from unittest.mock import Mock
import os
import pytest import pytest
import torch import torch
...@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, ...@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from ...utils import models_path_prefix
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
def create_scheduler( def create_scheduler(
model: str = "facebook/opt-125m", model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_seqs: int = 16, max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192, max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None, enable_prefix_caching: Optional[bool] = None,
...@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool], ...@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def test_schedule_multimodal_requests(): def test_schedule_multimodal_requests():
scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf") scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
mm_positions = [[PlaceholderRange(offset=i, length=100)] mm_positions = [[PlaceholderRange(offset=i, length=100)]
for i in range(10)] for i in range(10)]
requests = create_requests( requests = create_requests(
...@@ -243,7 +245,7 @@ def test_schedule_partial_requests(): ...@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget. there is insufficient encoder budget.
""" """
scheduler = create_scheduler( scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
) )
mm_positions = [[PlaceholderRange(offset=100, length=600)] mm_positions = [[PlaceholderRange(offset=100, length=600)]
...@@ -303,7 +305,7 @@ def test_schedule_partial_requests(): ...@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def test_no_mm_input_chunking(): def test_no_mm_input_chunking():
# Disable multimodal input chunking. # Disable multimodal input chunking.
scheduler = create_scheduler( scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
disable_chunked_mm_input=True, disable_chunked_mm_input=True,
max_model_len=2048, max_model_len=2048,
...@@ -347,7 +349,7 @@ def test_no_mm_input_chunking(): ...@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input. # of a max_num_batched_tokens for the mm input.
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = create_scheduler( _ = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=100, max_num_batched_tokens=100,
disable_chunked_mm_input=True, disable_chunked_mm_input=True,
) )
...@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): ...@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
""" """
scheduler = create_scheduler( scheduler = create_scheduler(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
long_prefill_token_threshold=400, long_prefill_token_threshold=400,
enable_prefix_caching=enable_prefix_caching, enable_prefix_caching=enable_prefix_caching,
......
...@@ -4,11 +4,12 @@ import os ...@@ -4,11 +4,12 @@ import os
import pytest import pytest
from vllm import LLM from vllm import LLM
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1": if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True) pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment