Commit bd363067 authored by lizhigong
Browse files

Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

parents 87ef4618 d36deb1a
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM ...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_forward(self, *args, **kwargs): def evil_forward(self, *args, **kwargs):
...@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int, ...@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt ...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError from vllm.v1.engine.exceptions import EngineGenerateError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None: ...@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None:
generated_tokens.extend(out.outputs[0].token_ids) generated_tokens.extend(out.outputs[0].token_ids)
assert len(generated_tokens) == EXPECTED_TOKENS assert len(generated_tokens) == EXPECTED_TOKENS
async_llm.shutdown() async_llm.shutdown()
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_method(self, *args, **kwargs): def evil_method(self, *args, **kwargs):
...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures. Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing. TODO(andy) - LLM without multiprocessing.
""" """
if model != "meta-llama/Llama-3.2-1B": if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B") pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
if cuda_device_count_stateless() < tensor_parallel_size: if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices") pytest.skip(reason="Not enough CUDA devices")
...@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly.""" """Test whether spec decoding handles the max model length properly."""
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
_PROMPTS = [ _PROMPTS = [
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
...@@ -21,7 +23,7 @@ def test_ngram_max_len( ...@@ -21,7 +23,7 @@ def test_ngram_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=100, max_model_len=100,
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
...@@ -44,11 +46,11 @@ def test_eagle_max_len( ...@@ -44,11 +46,11 @@ def test_eagle_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct", model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
"method": "eagle", "method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": num_speculative_tokens, "num_speculative_tokens": num_speculative_tokens,
}, },
max_model_len=100, max_model_len=100,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import numpy as np import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer, from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp, _find_subarray_kmp,
_kmp_lps_array) _kmp_lps_array)
from ...utils import models_path_prefix
def test_kmp_lps_array(): def test_kmp_lps_array():
...@@ -43,10 +45,10 @@ def test_ngram_proposer(): ...@@ -43,10 +45,10 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len. # Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m", model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
max_model_len=100, max_model_len=100,
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
dtype="auto", dtype="auto",
seed=None, seed=None,
...@@ -86,4 +88,4 @@ def test_ngram_proposer(): ...@@ -86,4 +88,4 @@ def test_ngram_proposer():
result = ngram_proposer( result = ngram_proposer(
2, 4, 2, 4,
2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4])) 2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2] assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
\ No newline at end of file
...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform ...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient from vllm.v1.engine.core_client import DPAsyncMPClient
from ..utils import models_path_prefix
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b", model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
enforce_eager=True, enforce_eager=True,
disable_log_requests=True, disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
...@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind): ...@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind):
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
assert not core_client.engines_running assert not core_client.engines_running
assert not core_client.reqs_in_flight assert not core_client.reqs_in_flight
\ No newline at end of file
...@@ -7,16 +7,17 @@ import vllm.envs as envs ...@@ -7,16 +7,17 @@ import vllm.envs as envs
from vllm import LLM from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription os.path.join(models_path_prefix, "openai/whisper-large-v3"), # transcription
"facebook/bart-large-cnn", # encoder decoder os.path.join(models_path_prefix, "facebook/bart-large-cnn"), # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"), # mamba
"ibm-ai-platform/Bamba-9B", # hybrid os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"), # hybrid
"BAAI/bge-m3", # embedding os.path.join(models_path_prefix, "BAAI/bge-m3"), # embedding
] ]
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
...@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch): ...@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch):
AsyncLLMEngine._get_executor_cls(vllm_config), AsyncLLMEngine._get_executor_cls(vllm_config),
log_stats=True) log_stats=True)
m.delenv("VLLM_USE_V1") m.delenv("VLLM_USE_V1")
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, ...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput) SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from ...utils import models_path_prefix
@pytest.fixture @pytest.fixture
...@@ -17,9 +19,9 @@ def model_runner(): ...@@ -17,9 +19,9 @@ def model_runner():
max_model_len=512, max_model_len=512,
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
...@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner): ...@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner):
assert _is_req_scheduled(model_runner, req_ids[0]) assert _is_req_scheduled(model_runner, req_ids[0])
assert _is_req_added(model_runner, req_ids[1]) assert _is_req_added(model_runner, req_ids[1])
assert not _is_req_scheduled(model_runner, req_ids[1]) assert not _is_req_scheduled(model_runner, req_ids[1])
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment