Commit bd363067 authored by lizhigong
Browse files

Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

parents 87ef4618 d36deb1a
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio
......
......@@ -3,6 +3,7 @@
import asyncio
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_forward(self, *args, **kwargs):
......
......@@ -3,6 +3,7 @@
import asyncio
import os
import pytest
from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
......@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio
......
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_method(self, *args, **kwargs):
......@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing.
"""
if model != "meta-llama/Llama-3.2-1B":
if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")
......
# SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly."""
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
_PROMPTS = [
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
......@@ -21,7 +23,7 @@ def test_ngram_max_len(
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=100,
enforce_eager=True, # For faster initialization.
speculative_config={
......@@ -44,11 +46,11 @@ def test_eagle_max_len(
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
enforce_eager=True, # For faster initialization.
speculative_config={
"method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": num_speculative_tokens,
},
max_model_len=100,
......
# SPDX-License-Identifier: Apache-2.0
import os
import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp,
_kmp_lps_array)
from ...utils import models_path_prefix
def test_kmp_lps_array():
......@@ -43,10 +45,10 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m",
model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto",
dtype="auto",
seed=None,
......
......@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient
from ..utils import models_path_prefix
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
enforce_eager=True,
disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
......
......@@ -7,16 +7,17 @@ import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba
"ibm-ai-platform/Bamba-9B", # hybrid
"BAAI/bge-m3", # embedding
os.path.join(models_path_prefix, "openai/whisper-large-v3"), # transcription
os.path.join(models_path_prefix, "facebook/bart-large-cnn"), # encoder decoder
os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"), # mamba
os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"), # hybrid
os.path.join(models_path_prefix, "BAAI/bge-m3"), # embedding
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
......@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from ...utils import models_path_prefix
@pytest.fixture
......@@ -17,9 +19,9 @@ def model_runner():
max_model_len=512,
)
model_config = ModelConfig(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate",
tokenizer="facebook/opt-125m",
tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment