Commit bd363067 authored by lizhigong
Browse files

Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

parents 87ef4618 d36deb1a
......@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import pytest
import os
from ..utils import models_path_prefix
from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
......@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make
# sure things pass through properly.
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
model = "Qwen/Qwen2-Audio-7B-Instruct"
model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts = [
f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501
......@@ -140,4 +140,4 @@ def test_beam_search_passes_multimodal_data(
assert filtered_hf_output_ids[-1] == eos_token_id
filtered_hf_output_ids = filtered_hf_output_ids[:-1]
assert filtered_hf_output_ids == filtered_vllm_output_ids
assert filtered_hf_output_ids == filtered_vllm_output_ids
\ No newline at end of file
......@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{
# Speculative max model len > overridden max model len should raise.
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 129,
},
......@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 2048 + 1,
},
......@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 131072 + 1,
},
......@@ -64,4 +64,4 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with pytest.raises(ValueError, match="cannot be larger than"):
get_output_from_llm_generator(test_llm_generator, prompts,
sampling_params)
sampling_params)
\ No newline at end of file
......@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "meta-llama/Llama-2-7b-chat-hf",
"model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "Qwen/Qwen2-7B-Instruct",
"model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......
......@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": "gptq",
},
......@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": "marlin",
},
......@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": None,
},
......@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3,
"disable_mqa_scorer": True,
},
......@@ -151,4 +151,4 @@ def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
temperature=0.0)
\ No newline at end of file
......@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctness for the target model outputs.
"""
import os
import pytest
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
......@@ -329,4 +331,4 @@ def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
if __name__ == "__main__":
import pytest
pytest.main([__file__])
pytest.main([__file__])
\ No newline at end of file
......@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -372,4 +372,4 @@ def test_ngram_scorer(vllm_runner, common_llm_kwargs,
batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
temperature=0.0)
\ No newline at end of file
......@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert (num_mismatch > 0)
@torch.inference_mode()
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths.
@pytest.mark.parametrize('attn_backend',
[_Backend.XFORMERS, _Backend.FLASH_ATTN])
def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token.
"""
seed = 100
model_name = "JackFram/llama-68m"
block_size = 16
num_gpu_blocks = 2048 // block_size
batch_size = 1
with global_force_attn_backend_context_manager(attn_backend):
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
multi_step_worker = create_worker(MultiStepWorker,
model_name,
block_size,
num_gpu_blocks,
seed,
model_runner_cls=TP1DraftModelRunner,
dtype=dtype)
multi_step_worker.set_include_gpu_probs_tensor()
worker = create_worker(Worker,
model_name,
block_size,
num_gpu_blocks,
seed,
dtype=dtype)
prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence
# so that we can simulate the bonus token case
multi_step_continuations = [[
random.randint(0, 1000),
random.randint(0, 1000)
] for _ in prompts]
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens)
# Run multi-step.
zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list),
sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly.
zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first
single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=single_step_continuations,
final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list))
for _ in range(num_steps):
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list))
for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations[i].append(
seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and
# multi-step workers are the same.
single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers = len(single_step_gpu_cache)
allclose = lambda a, b: torch.allclose(
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
for i in range(num_layers):
assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache[i][0])
assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache[i][1])
# @torch.inference_mode()
# @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# # The choice of backends forces the multi_step_worker to choose between
# # the vanilla model_runner and TP1DraftModelRunner and that we can test
# # both code paths.
# @pytest.mark.parametrize('attn_backend',
# [_Backend.XFORMERS, _Backend.FLASH_ATTN])
# def test_multi_step_correct_kvcache(num_steps, attn_backend):
# """Verify that the KV cache of the draft model
# is correctly updated for sequences with bonus token.
# """
# seed = 100
# model_name = "JackFram/llama-68m"
# block_size = 16
# num_gpu_blocks = 2048 // block_size
# batch_size = 1
# with global_force_attn_backend_context_manager(attn_backend):
# dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
# multi_step_worker = create_worker(MultiStepWorker,
# model_name,
# block_size,
# num_gpu_blocks,
# seed,
# model_runner_cls=TP1DraftModelRunner,
# dtype=dtype)
# multi_step_worker.set_include_gpu_probs_tensor()
# worker = create_worker(Worker,
# model_name,
# block_size,
# num_gpu_blocks,
# seed,
# dtype=dtype)
# prompts = [[0] for _ in range(batch_size)]
# # Already generate two tokens for the sequence
# # so that we can simulate the bonus token case
# multi_step_continuations = [[
# random.randint(0, 1000),
# random.randint(0, 1000)
# ] for _ in prompts]
# final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
# seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=multi_step_continuations,
# final_prompt_lens=final_prompt_lens)
# # Run multi-step.
# zero_kv_cache(multi_step_worker.cache_engine)
# multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list),
# sample_len=num_steps,
# seq_ids_with_bonus_token_in_last_step=
# seq_ids_with_bonus_token_in_last_step)
# # Run single-step repeatedly.
# zero_kv_cache(worker.cache_engine)
# # Generate the kv cache for the bonus token first
# single_step_continuations = [c[:1] for c in multi_step_continuations]
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=single_step_continuations,
# final_prompt_lens=final_prompt_lens)
# single_step_output = worker.execute_model(
# execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list))
# for _ in range(num_steps):
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=multi_step_continuations,
# final_prompt_lens=final_prompt_lens)
# single_step_output = worker.execute_model(
# execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list))
# for i, seq_group_output in enumerate(single_step_output[-1]):
# multi_step_continuations[i].append(
# seq_group_output.samples[0].output_token)
# # Verify that the KV cache of the single-step and
# # multi-step workers are the same.
# single_step_gpu_cache = worker.cache_engine[0].gpu_cache
# multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
# num_layers = len(single_step_gpu_cache)
# allclose = lambda a, b: torch.allclose(
# a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
# for i in range(num_layers):
# assert allclose(single_step_gpu_cache[i][0],
# multi_step_gpu_cache[i][0])
# assert allclose(single_step_gpu_cache[i][1],
# multi_step_gpu_cache[i][1])
@torch.inference_mode()
......
......@@ -5,6 +5,7 @@ from collections import defaultdict
from types import SimpleNamespace
from unittest.mock import MagicMock
import os
import pytest
import torch
......@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from .test_utils import mock_spec_decode_sampler
from .utils import (create_batch, create_sampler_output_list, create_worker,
mock_worker)
from ..utils import models_path_prefix
@pytest.mark.parametrize('k', [1, 2, 6])
......@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks = 8096 // block_size
target_worker = create_worker(
Worker,
"JackFram/llama-68m",
os.path.join(models_path_prefix, "JackFram/llama-68m"),
block_size,
num_gpu_blocks,
seed,
)
draft_worker = create_worker(
MultiStepWorker,
"abhigoyal/vllm-eagle-llama-68m-random",
os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
block_size,
num_gpu_blocks,
seed,
......@@ -941,4 +943,4 @@ def test_correctly_load_weight_for_eagle():
target_worker.model_runner.model.lm_head.weight.data)
assert torch.allclose(
worker.proposer_worker.worker.model_runner.model.lm_head.weight.data,
worker.scorer_worker.model_runner.model.lm_head.weight.data)
worker.scorer_worker.model_runner.model.lm_head.weight.data)
\ No newline at end of file
......@@ -7,6 +7,7 @@ import pathlib
import subprocess
from functools import partial
from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai
import pytest
......@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
......@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py",
)
model_ref = "meta-llama/Llama-2-7b-hf"
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path)
......@@ -431,4 +433,4 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
prompts, sampling_params)
# noqa: E501
assert outputs == deserialized_outputs
assert outputs == deserialized_outputs
\ No newline at end of file
......@@ -142,7 +142,7 @@ def test_get_sliding_window():
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config():
model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
model_config = ModelConfig(
model_id,
task="auto",
......@@ -164,7 +164,7 @@ def test_get_pooling_config():
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config_from_args():
model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
model_config = ModelConfig(model_id,
task="auto",
tokenizer=model_id,
......@@ -273,10 +273,10 @@ def test_rope_customization():
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Encoder Decoder models not supported on ROCm.")
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False),
("facebook/bart-base", True),
("meta-llama/Llama-3.2-1B-Instruct", False),
("meta-llama/Llama-3.2-11B-Vision", True),
(os.path.join(models_path_prefix, "facebook/opt-125m"), False),
(os.path.join(models_path_prefix, "facebook/bart-base"), True),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), False),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision"), True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):
config = ModelConfig(
......@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
@pytest.mark.parametrize(("model_id", "uses_mrope"), [
("facebook/opt-125m", False),
("Qwen/Qwen2-VL-2B-Instruct", True),
(os.path.join(models_path_prefix, "facebook/opt-125m"), False),
(os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), True),
])
def test_uses_mrope(model_id, uses_mrope):
config = ModelConfig(
......@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
def test_generation_config_loading():
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
model_id = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
# When set generation_config to "vllm", the default generation config
# will not be loaded.
......@@ -377,4 +377,4 @@ def test_generation_config_loading():
generation_config="vllm",
override_generation_config=override_generation_config)
assert model_config.get_diff_sampling_param() == override_generation_config
assert model_config.get_diff_sampling_param() == override_generation_config
\ No newline at end of file
......@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
will never happen again.
"""
import os
import gc
import pytest
......@@ -12,8 +13,9 @@ import torch
from vllm import LLM, SamplingParams
from utils import models_path_prefix
import os
from .utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
......@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
llm = LLM(model="distilbert/distilgpt2",
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
......@@ -36,9 +38,15 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=None)
llm = LLM(model="distilbert/distilgpt2",
max_num_batched_tokens=4096,
tensor_parallel_size=1)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1,
block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["Just say hello!"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
......@@ -46,7 +54,7 @@ def test_max_tokens_none():
def test_gc():
llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), enforce_eager=True)
del llm
gc.collect()
......@@ -63,7 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True")
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
prompts = [
"Hello, my name is",
......@@ -74,4 +85,4 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4
assert len(outputs) == 4
\ No newline at end of file
......@@ -2,13 +2,15 @@
"""Tests for the SamplingParams class.
"""
import os
import pytest
from vllm import SamplingParams
from vllm.config import ModelConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from utils import models_path_prefix
MODEL_NAME = "Qwen/Qwen1.5-7B"
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B")
def test_max_tokens_none():
......
......@@ -8,6 +8,7 @@ import socket
from collections.abc import AsyncIterator
from unittest.mock import patch
import os
import pytest
import torch
from vllm_test_utils.monitor import monitor
......
......@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama": {
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
......@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama3.2": {
"model":
"meta-llama/Llama-3.2-3B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
......@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
......@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4_json": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
"--distributed-executor-backend", "mp", "--tool-call-parser",
......@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# },
"granite-3.0-8b": {
"model":
"ibm-granite/granite-3.0-8b-instruct",
os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template",
......@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"granite-3.1-8b": {
"model":
"ibm-granite/granite-3.1-8b-instruct",
os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
"arguments": [
"--enforce-eager",
"--no-enable-prefix-caching",
......@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"internlm": {
"model":
"internlm/internlm2_5-7b-chat",
os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template",
......@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"toolACE": {
"model":
"Team-ACE/ToolACE-8B",
os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
......@@ -361,4 +361,4 @@ MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
"content":
"The weather in Orlando FL is 78 degrees fahrenheit with clear"
"skies."
}]
}]
\ No newline at end of file
......@@ -4,6 +4,8 @@ from dataclasses import dataclass
import lm_eval
import pytest
import os
from ..utils import models_path_prefix
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
......@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
......@@ -48,4 +50,4 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import torch
......@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
from ...utils import models_path_prefix
# yapf: enable
......@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@pytest.mark.parametrize(
("model_id", "max_model_len", "want_estimated_max_len"), [
("Qwen/Qwen1.5-7B", 16385, 16384),
("Qwen/Qwen1.5-7B", 16383, 16383),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
])
def test_estimate_max_model_len(model_id, max_model_len,
want_estimated_max_len):
......
......@@ -2,6 +2,7 @@
from typing import Optional
from unittest.mock import Mock
import os
import pytest
import torch
......@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from ...utils import models_path_prefix
EOS_TOKEN_ID = 50256
def create_scheduler(
model: str = "facebook/opt-125m",
model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
......@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def test_schedule_multimodal_requests():
scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
mm_positions = [[PlaceholderRange(offset=i, length=100)]
for i in range(10)]
requests = create_requests(
......@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget.
"""
scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024,
)
mm_positions = [[PlaceholderRange(offset=100, length=600)]
......@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def test_no_mm_input_chunking():
# Disable multimodal input chunking.
scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024,
disable_chunked_mm_input=True,
max_model_len=2048,
......@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input.
with pytest.raises(ValueError):
_ = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=100,
disable_chunked_mm_input=True,
)
......@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""
scheduler = create_scheduler(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=1024,
long_prefill_token_threshold=400,
enable_prefix_caching=enable_prefix_caching,
......@@ -1241,4 +1243,4 @@ def test_memory_leak():
scheduler.update_from_output(scheduler_output, model_runner_output)
# Confirm no memory leak.
assert_scheduler_empty(scheduler)
assert_scheduler_empty(scheduler)
\ No newline at end of file
......@@ -4,11 +4,12 @@ import os
import pytest
from vllm import LLM
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I"
......@@ -26,4 +27,4 @@ def test_concurrent_partial_prefill(model):
outputs = model.generate([PROMPT] * 3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
assert len(output.outputs) == 1
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment