Commit bd363067 authored by lizhigong
Browse files

Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

parents 87ef4618 d36deb1a
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM ...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_forward(self, *args, **kwargs): def evil_forward(self, *args, **kwargs):
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt ...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError from vllm.v1.engine.exceptions import EngineGenerateError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_method(self, *args, **kwargs): def evil_method(self, *args, **kwargs):
...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures. Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing. TODO(andy) - LLM without multiprocessing.
""" """
if model != "meta-llama/Llama-3.2-1B": if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B") pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
if cuda_device_count_stateless() < tensor_parallel_size: if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices") pytest.skip(reason="Not enough CUDA devices")
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly.""" """Test whether spec decoding handles the max model length properly."""
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
_PROMPTS = [ _PROMPTS = [
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
...@@ -21,7 +23,7 @@ def test_ngram_max_len( ...@@ -21,7 +23,7 @@ def test_ngram_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=100, max_model_len=100,
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
...@@ -44,11 +46,11 @@ def test_eagle_max_len( ...@@ -44,11 +46,11 @@ def test_eagle_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct", model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
"method": "eagle", "method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": num_speculative_tokens, "num_speculative_tokens": num_speculative_tokens,
}, },
max_model_len=100, max_model_len=100,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import numpy as np import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer, from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp, _find_subarray_kmp,
_kmp_lps_array) _kmp_lps_array)
from ...utils import models_path_prefix
def test_kmp_lps_array(): def test_kmp_lps_array():
...@@ -43,10 +45,10 @@ def test_ngram_proposer(): ...@@ -43,10 +45,10 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len. # Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m", model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
max_model_len=100, max_model_len=100,
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
dtype="auto", dtype="auto",
seed=None, seed=None,
......
...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform ...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient from vllm.v1.engine.core_client import DPAsyncMPClient
from ..utils import models_path_prefix
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b", model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
enforce_eager=True, enforce_eager=True,
disable_log_requests=True, disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
......
...@@ -7,16 +7,17 @@ import vllm.envs as envs ...@@ -7,16 +7,17 @@ import vllm.envs as envs
from vllm import LLM from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription os.path.join(models_path_prefix, "openai/whisper-large-v3"), # transcription
"facebook/bart-large-cnn", # encoder decoder os.path.join(models_path_prefix, "facebook/bart-large-cnn"), # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"), # mamba
"ibm-ai-platform/Bamba-9B", # hybrid os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"), # hybrid
"BAAI/bge-m3", # embedding os.path.join(models_path_prefix, "BAAI/bge-m3"), # embedding
] ]
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, ...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput) SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from ...utils import models_path_prefix
@pytest.fixture @pytest.fixture
...@@ -17,9 +19,9 @@ def model_runner(): ...@@ -17,9 +19,9 @@ def model_runner():
max_model_len=512, max_model_len=512,
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment