Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
......@@ -686,7 +686,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
async def test_completion_with_empty_prompt_embeds(
client: openai.AsyncOpenAI) -> None:
"""Test completion with empty prompt embeds."""
payload: dict[str, list] = {"prompt_embeds": []}
payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []}
headers: dict[str, str] = {"Content-Type": "application/json"}
# base_url = http://localhost:8000/v1/completions
response = requests.post(f"{client.base_url}completions",
......
......@@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
kwargs: Optional[dict] = None,
non_block: bool = False,
unique_reply_rank: Optional[int] = None) -> list[Any]:
# Drop marker to show that this was ran
# Drop marker to show that this was run
with open(".marker", "w"):
...
return super().collective_rpc(method, timeout, args, kwargs)
......
......@@ -42,7 +42,7 @@ def test_basic_lifecycle():
engine_core_outputs = scheduler.update_from_output(scheduler_output,
model_runner_output)
# Ensure the request is finished after 1 tokens.
# Ensure the request is finished after 1 token.
assert request.is_finished()
assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
output = engine_core_outputs[0].outputs[0]
......@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
def test_prefix_cache_lifecycle():
"""Test that remote decode params still works with a prefix cache hit."""
"""Test that remote decode params still work with a prefix cache hit."""
vllm_config = create_vllm_config()
scheduler = create_scheduler(vllm_config)
......
......@@ -33,7 +33,7 @@ def _check_path_len(path):
def _list_path(path):
"""Return the list of foldername (hashes generatd) under the path"""
"""Return the list of foldername (hashes generated) under the path"""
return list(path.iterdir())
......@@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
image_urls: list[Image], expected_len: int, info: str):
"""
One individual test to process the prompt and output base on 1 set of input
Then check if the length in the strorage path matches the expected length
Then check if the length in the storage path matches the expected length
`info` introduces details or purpose of the individual test
"""
print(f"***info: {info}***")
......@@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path):
"""
Tests that SharedStorageConnector saves KV to the storage locations
with proper hashes; that are unique for inputs with identical text but
differnt images (same size), or same multiple images but different orders.
different images (same size), or same multiple images but different orders.
"""
# Using tmp_path as the storage path to store KV
print(f"KV storage path at: {str(tmp_path)}")
......@@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path):
img=[image_1],
expected_len=2,
info=("image_1 single input the 2nd time. "
"It should not form aother new hash.")),
"It should not form another new hash.")),
InputCase(text=TEXT_PROMPTS[0],
img=[image_2],
expected_len=2,
info=("image_2 single input the 2nd time. "
"It should not form aother new hash.")),
"It should not form another new hash.")),
InputCase(text=TEXT_PROMPTS[0],
img=[image_1, image_2],
expected_len=3,
......@@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path):
img=[image_1, image_2],
expected_len=4,
info=("[image_1, image_2] input the 2nd time. "
"It should not form aother new hash.")),
"It should not form another new hash.")),
InputCase(text=TEXT_PROMPTS[0],
img=[image_2, image_1],
expected_len=4,
info=("[image_2, image_1] input the 2nd time. "
"It should not form aother new hash.")),
"It should not form another new hash.")),
InputCase(text=TEXT_PROMPTS[0],
img=[],
expected_len=5,
......
......@@ -13,6 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa
SharedStorageConnector)
from vllm.utils import sha256
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
......@@ -127,11 +128,11 @@ def create_request(request_id: int,
use_all_1s_for_prompt_tokens: bool = False,
num_remote_blocks: int = 3,
block_size: int = 16,
hash_fn: Callable = hash) -> Request:
hash_fn: Callable = sha256) -> Request:
"""Make dummy request for testing."""
global _none_hash_initialized
if not _none_hash_initialized:
init_none_hash(hash)
init_none_hash(hash_fn)
_none_hash_initialized = True
kv_transfer_params: Optional[dict[str, Any]] = None
......
......@@ -15,6 +15,7 @@ from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
POOLING_MODEL_NAME, TEMP_GREEDY,
CustomLogitprocSource,
DummyLogitsProcessor,
WrappedPerReqLogitsProcessor,
dummy_module)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
from tests.v1.logits_processors.utils import prompts
......@@ -80,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
if not all(x == target_token for x in lp_toks):
raise AssertionError(
f"Request {bdx} generated {lp_toks}, shoud all be "
f"Request {bdx} generated {lp_toks}, should all be "
f"{target_token}")
else:
# This request does not exercise custom logitproc (or custom
......@@ -161,6 +162,38 @@ def test_custom_logitsprocs(monkeypatch,
_run_test(kwargs, logitproc_loaded=True)
@create_new_process_for_each_test()
def test_custom_logitsprocs_req(monkeypatch):
"""Test passing request-level logits processor to offline Python interface
Wrap a request-level logits processor to create a batch level logits
processor that has a well-defined behavior (mask out all tokens except one
`target_token`)
Construct an `LLM` instance which loads the wrapped logits processor. Pass
the custom logitproc as a class object.
Construct a reference `LLM` instance with no custom logitproc
Pass in a batch of requests, 50% of which pass a `target_token` value
in through `SamplingParams.extra_args`, 50% of which do not.
Validate that
* Requests which do not activate the custom logitproc, yield the same
results for both `LLM` instances
* Requests which activate the custom logitproc, only output `target_token`
Args:
monkeypatch: for setting env vars
"""
# Test that logitproc info is passed to workers
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
random.seed(40)
_run_test({"logits_processors": [WrappedPerReqLogitsProcessor]},
logitproc_loaded=True)
@create_new_process_for_each_test()
@pytest.mark.parametrize("logitproc_source", [
CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,
......
......@@ -3,15 +3,21 @@
import types
from enum import Enum, auto
from typing import Optional
from typing import Any, Optional
import torch
from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate,
LogitsProcessor)
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP,
AdapterLogitsProcessor,
BatchUpdate, LogitsProcessor,
RequestLogitsProcessor)
from vllm.v1.sample.logits_processor.builtin import process_dict_updates
logger = init_logger(__name__)
MODEL_NAME = "facebook/opt-125m"
POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
DUMMY_LOGITPROC_ARG = "target_token"
......@@ -104,5 +110,60 @@ class EntryPoints(list):
self.names = [ep.name for ep in eps]
class DummyPerReqLogitsProcessor:
    """Request-level logits processor that lets a single token survive.

    Every logit is overwritten with ``-inf`` except the entry at index
    `target_token`, which keeps the value it had on entry.
    """

    def __init__(self, target_token: int) -> None:
        """Remember the id of the only token that should stay unmasked."""
        self.target_token = target_token

    def __call__(
        self,
        output_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Mask `logits` in place and hand the same tensor back.

        Args:
            output_ids: token ids passed by the caller (not used here)
            logits: tensor indexed by token id; modified in place

        Returns:
            The `logits` tensor that was passed in, after masking
        """
        keep_idx = self.target_token
        # Read the surviving value before the whole tensor is overwritten.
        kept_value = logits[keep_idx].item()
        logits.fill_(float("-inf"))
        logits[keep_idx] = kept_value
        return logits
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
    """Example of wrapping a fake request-level logit processor to create a
    batch-level logits processor"""

    def is_argmax_invariant(self) -> bool:
        """Always report False for this processor."""
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> Optional[RequestLogitsProcessor]:
        """This method returns a new request-level logits processor, customized
        to the `target_token` value associated with a particular request.

        Returns None if the logits processor should not be applied to the
        particular request. To use the logits processor the request must have
        a "target_token" custom argument with an integer value.

        Args:
            params: per-request sampling params

        Returns:
            `Callable` request logits processor, or None
        """
        extra_args = params.extra_args
        # Use an explicit conditional rather than `extra_args and ...`: the
        # `and` form evaluates to the (falsy) container itself when
        # `extra_args` is an empty dict, which is neither None nor an int and
        # would trigger the "is not int" warning below for a request that
        # simply did not ask for the processor.
        target_token: Optional[Any] = (extra_args.get("target_token")
                                       if extra_args else None)
        if target_token is None:
            return None
        if not isinstance(target_token, int):
            logger.warning(
                "target_token value %s is not int; not applying logits"
                " processor to request.", target_token)
            return None
        return DummyPerReqLogitsProcessor(target_token)
"""Fake version of importlib.metadata.entry_points"""
entry_points = lambda group: EntryPoints(group)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import pytest
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
class DummyStatLogger:
    """
    Stand-in stat logger used by the tests in this module.

    It performs no real logging; it only remembers what it was asked to do
    so that a test can inspect the calls afterwards.
    """

    def __init__(self, vllm_config, engine_idx):
        """Store the constructor arguments and reset all call trackers."""
        # Call trackers: nothing has been recorded or logged yet.
        self.recorded = []
        self.logged = False
        self.engine_initialized = False
        # Constructor arguments are kept verbatim.
        self.engine_idx = engine_idx
        self.vllm_config = vllm_config

    def record(self, scheduler_stats, iteration_stats, engine_idx):
        """Append the three arguments, as one tuple, to `recorded`."""
        entry = (scheduler_stats, iteration_stats, engine_idx)
        self.recorded.append(entry)

    def log(self):
        """Flag that `log` has been called."""
        self.logged = True

    def log_engine_initialized(self):
        """Flag that `log_engine_initialized` has been called."""
        self.engine_initialized = True
@pytest.fixture
def log_stats_enabled_engine_args():
    """
    Fixture returning the AsyncEngineArgs configuration shared by the tests
    in this module; stat logging is left enabled (disable_log_stats=False).
    """
    engine_kwargs = dict(
        model="distilbert/distilgpt2",
        dtype="half",
        disable_log_stats=False,
        enforce_eager=True,
    )
    return AsyncEngineArgs(**engine_kwargs)
@pytest.mark.asyncio
async def test_async_llm_replace_default_loggers(
        log_stats_enabled_engine_args):
    """
    RayPrometheusStatLogger should replace the default PrometheusStatLogger
    """
    engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args,
                                       stat_loggers=[RayPrometheusStatLogger])
    try:
        assert isinstance(engine.logger_manager.prometheus_logger,
                          RayPrometheusStatLogger)
    finally:
        # Shut the engine down even when the assertion fails, so a failing
        # test does not leave the engine running.
        engine.shutdown()
@pytest.mark.asyncio
async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
    """
    It's still possible to use custom stat loggers exclusively by passing
    disable_log_stats=True in addition to a list of custom stat loggers.
    """
    # Create engine_args with disable_log_stats=True for this test.
    # Work on a deep copy so the object returned by the fixture is untouched.
    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
    disabled_log_engine_args.disable_log_stats = True

    # Disable default loggers; pass custom stat logger to the constructor
    engine = AsyncLLM.from_engine_args(disabled_log_engine_args,
                                       stat_loggers=[DummyStatLogger])
    try:
        assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1
        assert isinstance(engine.logger_manager.per_engine_logger_dict[0][0],
                          DummyStatLogger)

        # log_stats is still True, since custom stat loggers are used
        assert engine.log_stats
    finally:
        # Shut the engine down even when an assertion fails, so a failing
        # test does not leave the engine running.
        engine.shutdown()
......@@ -430,7 +430,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
"""Engine should return all vocabulary logprobs
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
example_prompts: list of example prompts (test fixture)
......@@ -444,16 +444,24 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256)
sampling_params_logprobs_all = SamplingParams(max_tokens=5,
logprobs=-1)
logprobs=-1,
prompt_logprobs=-1)
results_logprobs_all = runner.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_all)
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
for i in range(len(results_logprobs_all)):
logprobs = results_logprobs_all[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_all[i].prompt_logprobs
assert logprobs is not None
for logprob in logprobs:
assert len(logprob) == vocab_size
assert prompt_logprobs is not None
assert prompt_logprobs[0] is None
for prompt_logprob in prompt_logprobs[1:]:
assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode))
......
......@@ -12,9 +12,10 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)
from vllm.config.load import LoadConfig
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer
......@@ -183,7 +184,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
mock_pp_group.world_size = pp_size
mock_get_pp_group.return_value = mock_pp_group
# Setup the target model mock with a custom class so that
# Set up the target model mock with a custom class so that
# isinstance() checks match the expected type.
class _TargetModelStub(LlamaForCausalLM):
model: mock.MagicMock
......
......@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
dtype=torch.bfloat16,
)
# Setup the block table and KV cache for paged KV.
# Set up the block table and KV cache for paged KV.
assert max_sequence_length % block_size == 0
max_blocks_per_batch = max_sequence_length // block_size
kv_cache = torch.randn(
......@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
num_alloc_blocks_per_batch] = block_ids.view(
-1, num_alloc_blocks_per_batch)
# Setup the slot mapping for the input KVs.
# Set up the slot mapping for the input KVs.
tree_positions = sequence_position + torch.arange(
0,
tree_size_q,
......
......@@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
}
# Layers 0 and 1 both belong in KV cache group 0
# However, if they have have different attention backends, they will be
# However, if they have different attention backends, they will be
# placed in different attention groups for KV cache group 0
kv_cache_groups = [
KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
......
......@@ -10,7 +10,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
]
......
......@@ -4,18 +4,19 @@
import openai
import pytest
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform
from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
image_asset:
encode_image_base64(local_asset_server.get_image_asset(image_asset))
for image_asset in TEST_IMAGE_ASSETS
}
......@@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
client: openai.AsyncOpenAI = remote_server.get_async_client()
# Other requests now should be much faster
for image_url in TEST_IMAGE_URLS:
for image_url in TEST_IMAGE_ASSETS:
image_base64 = base64_encoded_image[image_url]
chat_completion_from_base64 = await client.chat.completions\
.create(
......
......@@ -6,8 +6,12 @@ import pytest
import torch
from vllm.platforms import current_platform
from vllm.v1.sample.ops.topk_topp_sampler import (apply_top_k_top_p,
apply_top_k_top_p_tpu)
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
# isort: off
from vllm.v1.sample.tpu.sampler import (apply_top_k_top_p as
apply_top_k_top_p_tpu)
# isort: on
if not current_platform.is_tpu():
pytest.skip("This test needs a TPU.", allow_module_level=True)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# type: ignore
from __future__ import annotations
import threading
from collections.abc import Iterable
from concurrent import futures
from typing import Callable, Generator, Literal
import grpc
import pytest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
ExportTraceServiceResponse)
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
TraceServiceServicer, add_TraceServiceServicer_to_server)
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
from opentelemetry.sdk.environment_variables import (
OTEL_EXPORTER_OTLP_TRACES_INSECURE)
from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
'array_value']
def decode_value(value: AnyValue):
    """Convert an `AnyValue` message into a plain Python value.

    The bool, string, int and double variants are returned as the attribute
    of the same name; an array variant is decoded element by element into a
    list.

    Raises:
        ValueError: if none of the handled fields is set on `value`
    """
    # Scalar variants map straight onto the attribute carrying their name.
    scalar_fields: tuple[FieldName, ...] = (
        "bool_value",
        "string_value",
        "int_value",
        "double_value",
    )
    for name in scalar_fields:
        if value.HasField(name):
            return getattr(value, name)

    # Arrays hold nested values, so each element is decoded recursively.
    if value.HasField("array_value"):
        return [decode_value(element) for element in value.array_value.values]

    raise ValueError(f"Couldn't decode value: {value}")
def decode_attributes(attributes: Iterable[KeyValue]):
    """Build a dict mapping each entry's `key` to its decoded `value`.

    When the same key occurs more than once, the last occurrence wins.
    """
    decoded = {}
    for pair in attributes:
        name = pair.key
        decoded[name] = decode_value(pair.value)
    return decoded
class FakeTraceService(TraceServiceServicer):
    """Trace servicer that captures the most recent export request.

    `evt` is set whenever `Export` is called, so a test can block on it
    until a request has arrived and then read it from `request`.
    """

    def __init__(self):
        """Start with no captured request and an unset event."""
        self.evt = threading.Event()
        self.request = None

    def Export(self, request, context):
        """Keep `request`, signal `evt`, and reply with an empty response."""
        # Store the request before signalling so that a waiter woken by the
        # event finds it already in place.
        self.request = request
        self.evt.set()
        response = ExportTraceServiceResponse()
        return response
@pytest.fixture
def trace_service() -> Generator[FakeTraceService, None, None]:
    """Fixture that runs a fake gRPC trace service for one test.

    A server bound to FAKE_TRACE_SERVER_ADDRESS is started before the test
    and stopped after it; the test receives the FakeTraceService instance.
    """
    fake_service = FakeTraceService()
    worker_pool = futures.ThreadPoolExecutor(max_workers=1)
    grpc_server = grpc.server(worker_pool)

    add_TraceServiceServicer_to_server(fake_service, grpc_server)
    grpc_server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
    grpc_server.start()

    yield fake_service

    grpc_server.stop(None)
def test_traces(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    """Generate one completion and check the span exported for it.

    The LLM is pointed at the fake trace service; after generation the test
    waits for an export request, checks that it holds exactly one span, and
    compares the span attributes with the sampling params and the outputs.
    """
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
        m.setenv("VLLM_USE_V1", "1")

        model = "facebook/opt-125m"
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
            max_tokens=256,
        )
        llm = LLM(model=model,
                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
                  gpu_memory_utilization=0.3,
                  disable_log_stats=False)

        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)
        print(f"test_traces outputs is : {outputs}")

        # Block until the fake service has been handed an export request.
        timeout = 10
        trace_received = trace_service.evt.wait(timeout)
        if not trace_received:
            raise TimeoutError(
                f"The fake trace service didn't receive a trace within "
                f"the {timeout} seconds timeout")

        # Each level is indexed only after its length has been asserted.
        request = trace_service.request
        resource_spans = request.resource_spans
        assert len(resource_spans) == 1, (f"Expected 1 resource span, "
                                          f"but got {len(resource_spans)}")
        scope_spans = resource_spans[0].scope_spans
        assert len(scope_spans) == 1, (f"Expected 1 scope span, "
                                       f"but got {len(scope_spans)}")
        spans = scope_spans[0].spans
        assert len(spans) == 1, (f"Expected 1 span, "
                                 f"but got {len(spans)}")

        attributes = decode_attributes(spans[0].attributes)
        first_output = outputs[0]

        # Request parameters echoed back in the span.
        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        request_id = attributes.get(SpanAttributes.GEN_AI_REQUEST_ID)
        assert request_id == first_output.request_id
        temperature = attributes.get(
            SpanAttributes.GEN_AI_REQUEST_TEMPERATURE)
        assert temperature == sampling_params.temperature
        top_p = attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P)
        assert top_p == sampling_params.top_p
        max_tokens = attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS)
        assert max_tokens == sampling_params.max_tokens
        n = attributes.get(SpanAttributes.GEN_AI_REQUEST_N)
        assert n == sampling_params.n

        # Token usage must agree with what the engine returned.
        prompt_tokens = attributes.get(
            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS)
        assert prompt_tokens == len(first_output.prompt_token_ids)
        completion_tokens = sum(
            len(o.token_ids) for o in first_output.outputs)
        reported_completion_tokens = attributes.get(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS)
        assert reported_completion_tokens == completion_tokens

        # Latency attributes only need to be positive.
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
......@@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
KVCacheTensors for the attention and mamba layers
(via _reshape_kv_cache_tensors function). This test verifies
that the views are compatible: writing a mamba block
will not corrupt an attention block and vice-versa
will not corrupt an attention block and vice versa
'''
current_platform.seed_everything(42)
......
......@@ -6,7 +6,7 @@ set -e
# Default values
DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048"
# Parse command line arguments
while [[ $# -gt 0 ]]; do
......@@ -105,4 +105,4 @@ fi
popd
echo "✅ DeepGEMM installation completed successfully"
\ No newline at end of file
echo "✅ DeepGEMM installation completed successfully"
......@@ -29,7 +29,7 @@ run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/inputs
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/worker
run_mypy vllm/v1
......@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
if not all_the_same(trace_eles)), None)
if first_trace_difference is None:
# can't create a unique name, leave them names as the
# can't create a unique name, leave the names as they
# are they will get aggregated by the pivot_table call
continue
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment