Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
366 additions
and
38 deletions
+366
-38
tests/v1/entrypoints/openai/test_completion.py
tests/v1/entrypoints/openai/test_completion.py
+1
-1
tests/v1/executor/test_executor.py
tests/v1/executor/test_executor.py
+1
-1
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+2
-2
tests/v1/kv_connector/unit/test_shared_storage_connector.py
tests/v1/kv_connector/unit/test_shared_storage_connector.py
+7
-7
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+3
-2
tests/v1/logits_processors/test_custom_offline.py
tests/v1/logits_processors/test_custom_offline.py
+34
-1
tests/v1/logits_processors/utils.py
tests/v1/logits_processors/utils.py
+64
-3
tests/v1/metrics/test_engine_logger_apis.py
tests/v1/metrics/test_engine_logger_apis.py
+83
-0
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+10
-2
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+3
-2
tests/v1/spec_decode/test_tree_attention.py
tests/v1/spec_decode/test_tree_attention.py
+2
-2
tests/v1/test_kv_sharing.py
tests/v1/test_kv_sharing.py
+1
-1
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+0
-1
tests/v1/tpu/test_multimodal.py
tests/v1/tpu/test_multimodal.py
+7
-6
tests/v1/tpu/test_topk_topp_sampler.py
tests/v1/tpu/test_topk_topp_sampler.py
+6
-2
tests/v1/tracing/test_tracing.py
tests/v1/tracing/test_tracing.py
+137
-0
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+1
-1
tools/install_deepgemm.sh
tools/install_deepgemm.sh
+2
-2
tools/mypy.sh
tools/mypy.sh
+1
-1
tools/profiler/visualize_layerwise_profile.py
tools/profiler/visualize_layerwise_profile.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/v1/entrypoints/openai/test_completion.py
View file @
38d80967
...
...
@@ -686,7 +686,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
async
def
test_completion_with_empty_prompt_embeds
(
client
:
openai
.
AsyncOpenAI
)
->
None
:
"""Test completion with empty prompt embeds."""
payload
:
dict
[
str
,
lis
t
]
=
{
"prompt_embeds"
:
[]}
payload
:
dict
[
str
,
objec
t
]
=
{
"prompt"
:
"Hello"
,
"prompt_embeds"
:
[]}
headers
:
dict
[
str
,
str
]
=
{
"Content-Type"
:
"application/json"
}
# base_url = http://localhost:8000/v1/completions
response
=
requests
.
post
(
f
"
{
client
.
base_url
}
completions"
,
...
...
tests/v1/executor/test_executor.py
View file @
38d80967
...
...
@@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
kwargs
:
Optional
[
dict
]
=
None
,
non_block
:
bool
=
False
,
unique_reply_rank
:
Optional
[
int
]
=
None
)
->
list
[
Any
]:
# Drop marker to show that this was r
a
n
# Drop marker to show that this was r
u
n
with
open
(
".marker"
,
"w"
):
...
return
super
().
collective_rpc
(
method
,
timeout
,
args
,
kwargs
)
...
...
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
View file @
38d80967
...
...
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
engine_core_outputs
=
scheduler
.
update_from_output
(
scheduler_output
,
model_runner_output
)
# Ensure the request is finished after 1 token
s
.
# Ensure the request is finished after 1 token.
assert
request
.
is_finished
()
assert
request
.
status
==
RequestStatus
.
FINISHED_LENGTH_CAPPED
output
=
engine_core_outputs
[
0
].
outputs
[
0
]
...
...
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
def
test_prefix_cache_lifecycle
():
"""Test that remote decode params still work
s
with a prefix cache hit."""
"""Test that remote decode params still work with a prefix cache hit."""
vllm_config
=
create_vllm_config
()
scheduler
=
create_scheduler
(
vllm_config
)
...
...
tests/v1/kv_connector/unit/test_shared_storage_connector.py
View file @
38d80967
...
...
@@ -33,7 +33,7 @@ def _check_path_len(path):
def
_list_path
(
path
):
"""Return the list of foldername (hashes generatd) under the path"""
"""Return the list of foldername (hashes generat
e
d) under the path"""
return
list
(
path
.
iterdir
())
...
...
@@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
image_urls
:
list
[
Image
],
expected_len
:
int
,
info
:
str
):
"""
One individual test to process the prompt and output base on 1 set of input
Then check if the length in the st
r
orage path matches the expected length
Then check if the length in the storage path matches the expected length
`info` introduces details or purpose of the individual test
"""
print
(
f
"***info:
{
info
}
***"
)
...
...
@@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path):
"""
Tests that SharedStorageConnector saves KV to the storage locations
with proper hashes; that are unique for inputs with identical text but
differnt images (same size), or same multiple images but different orders.
differ
e
nt images (same size), or same multiple images but different orders.
"""
# Using tmp_path as the storage path to store KV
print
(
f
"KV storage path at:
{
str
(
tmp_path
)
}
"
)
...
...
@@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path):
img
=
[
image_1
],
expected_len
=
2
,
info
=
(
"image_1 single input the 2nd time. "
"It should not form aother new hash."
)),
"It should not form a
n
other new hash."
)),
InputCase
(
text
=
TEXT_PROMPTS
[
0
],
img
=
[
image_2
],
expected_len
=
2
,
info
=
(
"image_2 single input the 2nd time. "
"It should not form aother new hash."
)),
"It should not form a
n
other new hash."
)),
InputCase
(
text
=
TEXT_PROMPTS
[
0
],
img
=
[
image_1
,
image_2
],
expected_len
=
3
,
...
...
@@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path):
img
=
[
image_1
,
image_2
],
expected_len
=
4
,
info
=
(
"[image_1, image_2] input the 2nd time. "
"It should not form aother new hash."
)),
"It should not form a
n
other new hash."
)),
InputCase
(
text
=
TEXT_PROMPTS
[
0
],
img
=
[
image_2
,
image_1
],
expected_len
=
4
,
info
=
(
"[image_2, image_1] input the 2nd time. "
"It should not form aother new hash."
)),
"It should not form a
n
other new hash."
)),
InputCase
(
text
=
TEXT_PROMPTS
[
0
],
img
=
[],
expected_len
=
5
,
...
...
tests/v1/kv_connector/unit/utils.py
View file @
38d80967
...
...
@@ -13,6 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory
)
from
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector
import
(
# noqa
SharedStorageConnector
)
from
vllm.utils
import
sha256
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.core.kv_cache_utils
import
(
get_request_block_hasher
,
init_none_hash
)
...
...
@@ -127,11 +128,11 @@ def create_request(request_id: int,
use_all_1s_for_prompt_tokens
:
bool
=
False
,
num_remote_blocks
:
int
=
3
,
block_size
:
int
=
16
,
hash_fn
:
Callable
=
ha
sh
)
->
Request
:
hash_fn
:
Callable
=
sh
a256
)
->
Request
:
"""Make dummy request for testing."""
global
_none_hash_initialized
if
not
_none_hash_initialized
:
init_none_hash
(
hash
)
init_none_hash
(
hash
_fn
)
_none_hash_initialized
=
True
kv_transfer_params
:
Optional
[
dict
[
str
,
Any
]]
=
None
...
...
tests/v1/logits_processors/test_custom_offline.py
View file @
38d80967
...
...
@@ -15,6 +15,7 @@ from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
POOLING_MODEL_NAME
,
TEMP_GREEDY
,
CustomLogitprocSource
,
DummyLogitsProcessor
,
WrappedPerReqLogitsProcessor
,
dummy_module
)
from
tests.v1.logits_processors.utils
import
entry_points
as
fake_entry_points
from
tests.v1.logits_processors.utils
import
prompts
...
...
@@ -80,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
target_token
=
params
.
extra_args
[
DUMMY_LOGITPROC_ARG
]
if
not
all
(
x
==
target_token
for
x
in
lp_toks
):
raise
AssertionError
(
f
"Request
{
bdx
}
generated
{
lp_toks
}
, shoud all be "
f
"Request
{
bdx
}
generated
{
lp_toks
}
, shou
l
d all be "
f
"
{
target_token
}
"
)
else
:
# This request does not exercise custom logitproc (or custom
...
...
@@ -161,6 +162,38 @@ def test_custom_logitsprocs(monkeypatch,
_run_test
(
kwargs
,
logitproc_loaded
=
True
)
@
create_new_process_for_each_test
()
def
test_custom_logitsprocs_req
(
monkeypatch
):
"""Test passing request-level logits processor to offline Python interface
Wrap a request-level logits processor to create a batch level logits
processor that has a well-defined behavior (mask out all tokens except one
`target_token`)
Construct an `LLM` instance which loads the wrapped logits processor. Pass
the custom logitproc as a class object.
Construct a reference `LLM` instance with no custom logitproc
Pass in a batch of requests, 50% of which pass a `target_token` value
in through `SamplingParams.extra_args`, 50% of which do not.
Validate that
* Requests which do not activate the custom logitproc, yield the same
results for both `LLM` instances
* Requests which activate the custom logitproc, only output `target_token`
Args:
monkeypatch: for setting env vars
"""
# Test that logitproc info is passed to workers
monkeypatch
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"1"
)
random
.
seed
(
40
)
_run_test
({
"logits_processors"
:
[
WrappedPerReqLogitsProcessor
]},
logitproc_loaded
=
True
)
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"logitproc_source"
,
[
CustomLogitprocSource
.
LOGITPROC_SOURCE_ENTRYPOINT
,
...
...
tests/v1/logits_processors/utils.py
View file @
38d80967
...
...
@@ -3,15 +3,21 @@
import
types
from
enum
import
Enum
,
auto
from
typing
import
Optional
from
typing
import
Any
,
Optional
import
torch
from
vllm.config
import
VllmConfig
from
vllm.v1.sample.logits_processor
import
(
LOGITSPROCS_GROUP
,
BatchUpdate
,
LogitsProcessor
)
from
vllm.logger
import
init_logger
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.sample.logits_processor
import
(
LOGITSPROCS_GROUP
,
AdapterLogitsProcessor
,
BatchUpdate
,
LogitsProcessor
,
RequestLogitsProcessor
)
from
vllm.v1.sample.logits_processor.builtin
import
process_dict_updates
logger
=
init_logger
(
__name__
)
MODEL_NAME
=
"facebook/opt-125m"
POOLING_MODEL_NAME
=
"BAAI/bge-base-en-v1.5"
DUMMY_LOGITPROC_ARG
=
"target_token"
...
...
@@ -104,5 +110,60 @@ class EntryPoints(list):
self
.
names
=
[
ep
.
name
for
ep
in
eps
]
class DummyPerReqLogitsProcessor:
    """Request-level logits processor that lets a single token survive.

    Every logit is overwritten with -inf except the entry at index
    `target_token`, which keeps its original value.
    """

    def __init__(self, target_token: int) -> None:
        """Remember which token id must survive the masking."""
        self.target_token = target_token

    def __call__(
        self,
        output_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Mask `logits` in place and return that same tensor.

        `output_ids` is part of the call signature but is not read here.
        """
        keep_idx = self.target_token
        # Pull the surviving value out as a Python scalar before the tensor
        # is overwritten, then write it back afterwards.
        preserved = logits[keep_idx].item()
        logits.fill_(float("-inf"))
        logits[keep_idx] = preserved
        return logits
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
    """Batch-level logits processor obtained by wrapping a fake
    request-level logits processor"""

    def is_argmax_invariant(self) -> bool:
        """Always report False."""
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> Optional[RequestLogitsProcessor]:
        """Create the request-level logits processor for a single request.

        The processor is customized with the "target_token" custom argument
        found in the request's `extra_args`. The request opts in by supplying
        that argument with an integer value; otherwise no processor is
        returned for it.

        Args:
          params: per-request sampling params

        Returns:
          A `DummyPerReqLogitsProcessor` bound to the requested token, or
          None when the argument is absent or is not an int.
        """
        target_token: Optional[Any] = (params.extra_args
                                       and params.extra_args.get("target_token"))
        if target_token is None:
            # The request did not ask for this logits processor.
            return None
        if isinstance(target_token, int):
            return DummyPerReqLogitsProcessor(target_token)
        # A value was supplied but cannot be used as a token id.
        logger.warning(
            "target_token value %s is not int; not applying logits"
            " processor to request.", target_token)
        return None
"""Fake version of importlib.metadata.entry_points"""
def entry_points(group):
    """Fake version of importlib.metadata.entry_points.

    Defined as a regular function rather than a lambda bound to a name, so
    it carries its own name in tracebacks and can hold this docstring.

    Args:
        group: passed unchanged to the `EntryPoints` constructor.

    Returns:
        A new `EntryPoints` instance built from `group`.
    """
    return EntryPoints(group)
tests/v1/metrics/test_engine_logger_apis.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
copy
import
pytest
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusStatLogger
class DummyStatLogger:
    """
    Stand-in stat logger for tests.
    Implements the minimal interface expected by StatLoggerManager and only
    remembers which of its methods were called.
    """

    def __init__(self, vllm_config, engine_idx):
        self.vllm_config, self.engine_idx = vllm_config, engine_idx
        # One (scheduler_stats, iteration_stats, engine_idx) tuple per
        # record() call, in call order.
        self.recorded = []
        # Flipped to True by log() / log_engine_initialized() respectively.
        self.logged = False
        self.engine_initialized = False

    def record(self, scheduler_stats, iteration_stats, engine_idx):
        """Append the three arguments to `recorded` as one tuple."""
        entry = (scheduler_stats, iteration_stats, engine_idx)
        self.recorded.append(entry)

    def log(self):
        """Mark that log() has been called."""
        self.logged = True

    def log_engine_initialized(self):
        """Mark that log_engine_initialized() has been called."""
        self.engine_initialized = True
@pytest.fixture
def log_stats_enabled_engine_args():
    """
    Common AsyncEngineArgs configuration reused by the tests in this module;
    stat logging is left enabled (disable_log_stats=False).
    """
    engine_kwargs = {
        "model": "distilbert/distilgpt2",
        "dtype": "half",
        "disable_log_stats": False,
        "enforce_eager": True,
    }
    return AsyncEngineArgs(**engine_kwargs)
@pytest.mark.asyncio
async def test_async_llm_replace_default_loggers(
        log_stats_enabled_engine_args):
    """
    RayPrometheusStatLogger should replace the default PrometheusStatLogger

    Args:
        log_stats_enabled_engine_args: fixture providing the shared
            AsyncEngineArgs with stat logging enabled.
    """
    engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args,
                                       stat_loggers=[RayPrometheusStatLogger])
    try:
        assert isinstance(engine.logger_manager.prometheus_logger,
                          RayPrometheusStatLogger)
    finally:
        # Shut the engine down even when the assertion fails, so a failing
        # test does not leave the engine running for the tests that follow.
        engine.shutdown()
@pytest.mark.asyncio
async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
    """
    It's still possible to use custom stat loggers exclusively by passing
    disable_log_stats=True in addition to a list of custom stat loggers.

    Args:
        log_stats_enabled_engine_args: fixture providing the shared
            AsyncEngineArgs; it is deep-copied here so the fixture value
            itself is not mutated.
    """
    # Create engine_args with disable_log_stats=True for this test
    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
    disabled_log_engine_args.disable_log_stats = True

    # Disable default loggers; pass custom stat logger to the constructor
    engine = AsyncLLM.from_engine_args(disabled_log_engine_args,
                                       stat_loggers=[DummyStatLogger])
    try:
        engine_zero_loggers = engine.logger_manager.per_engine_logger_dict[0]
        assert len(engine_zero_loggers) == 1
        assert isinstance(engine_zero_loggers[0], DummyStatLogger)

        # log_stats is still True, since custom stat loggers are used
        assert engine.log_stats
    finally:
        # Shut the engine down even when an assertion fails, so a failing
        # test does not leave the engine running for the tests that follow.
        engine.shutdown()
tests/v1/sample/test_logprobs.py
View file @
38d80967
...
...
@@ -430,7 +430,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
def
test_all_logprobs
(
example_prompts
,
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Engine should return all vocabulary logprobs
"""Engine should return all vocabulary logprobs
and prompt logprobs
Args:
example_prompts: list of example prompts (test fixture)
...
...
@@ -444,16 +444,24 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
)
sampling_params_logprobs_all
=
SamplingParams
(
max_tokens
=
5
,
logprobs
=-
1
)
logprobs
=-
1
,
prompt_logprobs
=-
1
)
results_logprobs_all
=
runner
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_all
)
vocab_size
=
runner
.
llm
.
llm_engine
.
get_model_config
().
get_vocab_size
()
for
i
in
range
(
len
(
results_logprobs_all
)):
logprobs
=
results_logprobs_all
[
i
].
outputs
[
0
].
logprobs
prompt_logprobs
=
results_logprobs_all
[
i
].
prompt_logprobs
assert
logprobs
is
not
None
for
logprob
in
logprobs
:
assert
len
(
logprob
)
==
vocab_size
assert
prompt_logprobs
is
not
None
assert
prompt_logprobs
[
0
]
is
None
for
prompt_logprob
in
prompt_logprobs
[
1
:]:
assert
len
(
prompt_logprob
)
==
vocab_size
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
list
(
LogprobsMode
))
...
...
tests/v1/spec_decode/test_eagle.py
View file @
38d80967
...
...
@@ -12,9 +12,10 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata
,
create_standard_kv_cache_spec
,
get_attention_backend
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
ModelConfig
,
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
)
from
vllm.config.load
import
LoadConfig
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.platforms
import
current_platform
from
vllm.v1.spec_decode.eagle
import
EagleProposer
...
...
@@ -183,7 +184,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
mock_pp_group
.
world_size
=
pp_size
mock_get_pp_group
.
return_value
=
mock_pp_group
# Setup the target model mock with a custom class so that
# Set
up the target model mock with a custom class so that
# isinstance() checks match the expected type.
class
_TargetModelStub
(
LlamaForCausalLM
):
model
:
mock
.
MagicMock
...
...
tests/v1/spec_decode/test_tree_attention.py
View file @
38d80967
...
...
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
dtype
=
torch
.
bfloat16
,
)
# Setup the block table and KV cache for paged KV.
# Set
up the block table and KV cache for paged KV.
assert
max_sequence_length
%
block_size
==
0
max_blocks_per_batch
=
max_sequence_length
//
block_size
kv_cache
=
torch
.
randn
(
...
...
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
num_alloc_blocks_per_batch
]
=
block_ids
.
view
(
-
1
,
num_alloc_blocks_per_batch
)
# Setup the slot mapping for the input KVs.
# Set
up the slot mapping for the input KVs.
tree_positions
=
sequence_position
+
torch
.
arange
(
0
,
tree_size_q
,
...
...
tests/v1/test_kv_sharing.py
View file @
38d80967
...
...
@@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
}
# Layers 0 and 1 both belong in KV cache group 0
# However, if they have
have
different attention backends, they will be
# However, if they have different attention backends, they will be
# placed in different attention groups for KV cache group 0
kv_cache_groups
=
[
KVCacheGroupSpec
([
"model.layers.0"
,
"model.layers.1"
],
...
...
tests/v1/test_oracle.py
View file @
38d80967
...
...
@@ -10,7 +10,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
UNSUPPORTED_MODELS_V1
=
[
"openai/whisper-large-v3"
,
# transcription
"facebook/bart-large-cnn"
,
# encoder decoder
]
...
...
tests/v1/tpu/test_multimodal.py
View file @
38d80967
...
...
@@ -4,18 +4,19 @@
import
openai
import
pytest
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
vllm.multimodal.utils
import
encode_image_base64
from
vllm.platforms
import
current_platform
from
...entrypoints.openai.test_vision
import
TEST_IMAGE_
URL
S
from
...entrypoints.openai.test_vision
import
TEST_IMAGE_
ASSET
S
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_image
()
->
dict
[
str
,
str
]:
def
base64_encoded_image
(
local_asset_server
)
->
dict
[
str
,
str
]:
return
{
image_url
:
encode_image_base64
(
fetch_image
(
image_url
))
for
image_url
in
TEST_IMAGE_URLS
image_asset
:
encode_image_base64
(
local_asset_server
.
get_image_asset
(
image_asset
))
for
image_asset
in
TEST_IMAGE_ASSETS
}
...
...
@@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
client
:
openai
.
AsyncOpenAI
=
remote_server
.
get_async_client
()
# Other requests now should be much faster
for
image_url
in
TEST_IMAGE_
URL
S
:
for
image_url
in
TEST_IMAGE_
ASSET
S
:
image_base64
=
base64_encoded_image
[
image_url
]
chat_completion_from_base64
=
await
client
.
chat
.
completions
\
.
create
(
...
...
tests/v1/tpu/test_topk_topp_sampler.py
View file @
38d80967
...
...
@@ -6,8 +6,12 @@ import pytest
import
torch
from
vllm.platforms
import
current_platform
from
vllm.v1.sample.ops.topk_topp_sampler
import
(
apply_top_k_top_p
,
apply_top_k_top_p_tpu
)
from
vllm.v1.sample.ops.topk_topp_sampler
import
apply_top_k_top_p
# isort: off
from
vllm.v1.sample.tpu.sampler
import
(
apply_top_k_top_p
as
apply_top_k_top_p_tpu
)
# isort: on
if
not
current_platform
.
is_tpu
():
pytest
.
skip
(
"This test needs a TPU."
,
allow_module_level
=
True
)
...
...
tests/v1/tracing/test_tracing.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# type: ignore
from
__future__
import
annotations
import
threading
from
collections.abc
import
Iterable
from
concurrent
import
futures
from
typing
import
Callable
,
Generator
,
Literal
import
grpc
import
pytest
from
opentelemetry.proto.collector.trace.v1.trace_service_pb2
import
(
ExportTraceServiceResponse
)
from
opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc
import
(
TraceServiceServicer
,
add_TraceServiceServicer_to_server
)
from
opentelemetry.proto.common.v1.common_pb2
import
AnyValue
,
KeyValue
from
opentelemetry.sdk.environment_variables
import
(
OTEL_EXPORTER_OTLP_TRACES_INSECURE
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.tracing
import
SpanAttributes
# host:port used both as the bind address of the fake gRPC trace service and
# as the `otlp_traces_endpoint` handed to the LLM under test.
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

# Names of the `AnyValue` fields that `decode_value` probes with HasField().
FieldName = Literal[
    "bool_value",
    "string_value",
    "int_value",
    "double_value",
    "array_value",
]
def decode_value(value: AnyValue):
    """Turn an `AnyValue` message into a plain Python value.

    The fields are probed with `HasField` in a fixed order: bool, string,
    int, double, then array. The first field that is set decides the result;
    array elements are decoded recursively into a list.

    Raises:
        ValueError: if none of the probed fields is set on `value`.
    """
    scalar_fields = ("bool_value", "string_value", "int_value",
                     "double_value")
    for name in scalar_fields:
        if value.HasField(name):
            return getattr(value, name)

    if value.HasField("array_value"):
        return [decode_value(item) for item in value.array_value.values]

    raise ValueError(f"Couldn't decode value: {value}")
def decode_attributes(attributes: Iterable[KeyValue]):
    """Build a dict mapping each attribute's key to its decoded value.

    Each item's `value` is passed through `decode_value`; when a key occurs
    more than once, the later item wins.
    """
    decoded = {}
    for pair in attributes:
        decoded[pair.key] = decode_value(pair.value)
    return decoded
class FakeTraceService(TraceServiceServicer):
    """Trace service stub that keeps the most recent export request.

    `evt` is set once `Export` has been called, so a test can block on it
    before reading `request`.
    """

    def __init__(self):
        self.evt = threading.Event()
        # Last request passed to Export(); None until the first call.
        self.request = None

    def Export(self, request, context):
        """Store `request`, signal `evt`, and return an empty response."""
        # Store before signalling so a waiter woken by the event always
        # sees the request.
        self.request = request
        self.evt.set()
        return ExportTraceServiceResponse()
@pytest.fixture
def trace_service() -> Generator[FakeTraceService, None, None]:
    """Fixture to set up a fake gRPC trace service"""
    worker_pool = futures.ThreadPoolExecutor(max_workers=1)
    grpc_server = grpc.server(worker_pool)

    fake_service = FakeTraceService()
    add_TraceServiceServicer_to_server(fake_service, grpc_server)

    grpc_server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
    grpc_server.start()

    yield fake_service

    # Teardown: stop the server with no grace period (None).
    grpc_server.stop(None)
def test_traces(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    """Generate once with tracing pointed at the fake service and check the
    exported span.

    The test waits up to 10 seconds for the fake service to receive an
    export, expects exactly one resource span / scope span / span, and
    compares the span attributes against the sampling params and the
    generation output.
    """
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(temperature=0.01,
                                         top_p=0.1,
                                         max_tokens=256)
        model = "facebook/opt-125m"
        llm = LLM(
            model=model,
            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
            gpu_memory_utilization=0.3,
            disable_log_stats=False,
        )
        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)
        print(f"test_traces outputs is : {outputs}")

        # Block until the fake service has seen an export, or give up.
        timeout = 10
        trace_received = trace_service.evt.wait(timeout)
        if not trace_received:
            raise TimeoutError(
                f"The fake trace service didn't receive a trace within "
                f"the {timeout} seconds timeout")

        request = trace_service.request

        # Walk down one level at a time so that a wrong count fails with
        # its own message before the next level is indexed.
        resource_spans = request.resource_spans
        assert len(resource_spans) == 1, (f"Expected 1 resource span, "
                                          f"but got {len(resource_spans)}")
        scope_spans = resource_spans[0].scope_spans
        assert len(scope_spans) == 1, (f"Expected 1 scope span, "
                                       f"but got {len(scope_spans)}")
        spans = scope_spans[0].spans
        assert len(spans) == 1, (f"Expected 1 span, "
                                 f"but got {len(spans)}")

        attributes = decode_attributes(spans[0].attributes)
        first_output = outputs[0]

        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        assert (attributes.get(SpanAttributes.GEN_AI_REQUEST_ID) ==
                first_output.request_id)
        assert (attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE) ==
                sampling_params.temperature)
        assert (attributes.get(SpanAttributes.GEN_AI_REQUEST_TOP_P) ==
                sampling_params.top_p)
        assert (attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) ==
                sampling_params.max_tokens)
        assert (attributes.get(SpanAttributes.GEN_AI_REQUEST_N) ==
                sampling_params.n)
        assert (attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) ==
                len(first_output.prompt_token_ids))

        completion_tokens = sum(
            len(o.token_ids) for o in first_output.outputs)
        assert (attributes.get(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS)
                == completion_tokens)

        # Latency attributes only need to be positive.
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
tests/v1/worker/test_gpu_model_runner.py
View file @
38d80967
...
...
@@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
KVCacheTensors for the attention and mamba layers
(via _reshape_kv_cache_tensors function). This test verifies
that the views are compatible: writing a mamba block
will not corrupt an attention block and vice
-
versa
will not corrupt an attention block and vice
versa
'''
current_platform
.
seed_everything
(
42
)
...
...
tools/install_deepgemm.sh
View file @
38d80967
...
...
@@ -6,7 +6,7 @@ set -e
# Default values
DEEPGEMM_GIT_REPO
=
"https://github.com/deepseek-ai/DeepGEMM.git"
DEEPGEMM_GIT_REF
=
"
7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c
"
DEEPGEMM_GIT_REF
=
"
ea9c5d9270226c5dd7a577c212e9ea385f6ef048
"
# Parse command line arguments
while
[[
$#
-gt
0
]]
;
do
...
...
@@ -105,4 +105,4 @@ fi
popd
echo
"✅ DeepGEMM installation completed successfully"
\ No newline at end of file
echo
"✅ DeepGEMM installation completed successfully"
tools/mypy.sh
View file @
38d80967
...
...
@@ -29,7 +29,7 @@ run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/inputs
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy
--exclude
'vllm/model_executor/layers/fla/ops'
vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/worker
run_mypy vllm/v1
tools/profiler/visualize_layerwise_profile.py
View file @
38d80967
...
...
@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
if
not
all_the_same
(
trace_eles
)),
None
)
if
first_trace_difference
is
None
:
# can't create a unique name, leave the
m
names as the
# can't create a unique name, leave the names as the
y
# are they will get aggregated by the pivot_table call
continue
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment