Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6c036615
Unverified
Commit
6c036615
authored
Sep 17, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 17, 2025
Browse files
[V0 Deprecation] Remove misc V0 tests (#25118)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
2fc24e94
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
190 deletions
+0
-190
tests/model_executor/test_logits_processor.py
tests/model_executor/test_logits_processor.py
+0
-98
tests/test_cache_block_hashing.py
tests/test_cache_block_hashing.py
+0
-92
No files found.
tests/model_executor/test_logits_processor.py
deleted
100644 → 0
View file @
2fc24e94
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
is_pin_memory_available
class MockLogitsProcessor(LogitsProcessor):
    """LogitsProcessor test double that serves pre-baked logits.

    While ``forward`` runs, two names in
    ``vllm.model_executor.layers.logits_processor`` are patched:
    ``_prune_hidden_states`` returns its first argument unchanged and
    ``LogitsProcessor._get_logits`` returns ``self.fake_logits``.
    """

    def __init__(self, vocab_size: int, scale: float,
                 fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size, scale=scale)
        # Hold a private copy so later changes to the caller's tensor do not
        # affect what the patched `_get_logits` returns.
        self.fake_logits = fake_logits.clone()

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` with the two patches active."""

        def _passthrough(hidden_states, _unused):
            # Stand-in for `_prune_hidden_states`: hand back the first
            # argument untouched.
            return hidden_states

        def _canned_logits(*_args, **_kwargs):
            # Stand-in for `_get_logits`: ignore every argument.
            return self.fake_logits

        with patch(
                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
                _passthrough):
            with patch(
                    "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
                    _canned_logits):
                return super().forward(*args, **kwargs)
def _prepare_test(
    batch_size: int,
    *,
    vocab_size: int = 32000,
    hidden_size: int = 1024,
    scale: float = 0.5,
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    """Build the tensors and the mocked processor used by the test.

    Args:
        batch_size: Number of rows in both returned tensors.
        vocab_size: Number of columns of the fake logits; the same value is
            passed to ``MockLogitsProcessor`` so the two cannot drift apart.
        hidden_size: Number of columns of the random input tensor.
        scale: Scale passed to ``MockLogitsProcessor``.

    Returns:
        A tuple ``(input_tensor, fake_logits, logits_processor)`` where
        ``input_tensor`` is a random float16 tensor of shape
        ``(batch_size, hidden_size)``, ``fake_logits`` is a tensor of the
        same dtype with shape ``(batch_size, vocab_size)`` filled with
        ``1e-2``, and ``logits_processor`` is a ``MockLogitsProcessor``
        constructed from ``fake_logits``.
    """
    input_tensor = torch.rand((batch_size, hidden_size), dtype=torch.float16)
    fake_logits = torch.full((batch_size, vocab_size),
                             1e-2,
                             dtype=input_tensor.dtype)
    # Use the single `vocab_size` value instead of repeating the literal.
    logits_processor = MockLogitsProcessor(vocab_size, scale, fake_logits)
    return input_tensor, fake_logits, logits_processor
# Seeds 0..127; each one is a separate parametrized test case.
RANDOM_SEEDS = [*range(128)]

# A single entry ("cuda:0") when exactly one CUDA device is reported,
# otherwise two entries ("cuda:0" and "cuda:1").
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(2 if torch.cuda.device_count() != 1 else 1)
]
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_logits_processors(seed: int, device: str):
    """Check that per-request logits processors and scaling are applied.

    A random-sized batch of prompt requests is pushed through the mocked
    processor. Column 0 of the output must be infinite for every row, and
    column 1 must equal the fake logits multiplied by the processor's scale.
    """
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the token id sequence it receives.
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    # One prompt-phase request per batch row, each with its own sampling
    # params that carry the `pick_ith` processor.
    seq_group_metadata_list = [
        SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=True,
            seq_data={0: SequenceData.from_seqs([1, 2, 3])},
            sampling_params=SamplingParams(temperature=0,
                                           logits_processors=[pick_ith]),
            block_tables={0: [1]},
        ) for i in range(batch_size)
    ]
    seq_lens = [
        group.seq_data[0].get_len() for group in seq_group_metadata_list
    ]

    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
        device=device,
        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        lm_head=None,
        hidden_states=input_tensor,
        sampling_metadata=sampling_metadata)

    # Column 0 is the index `pick_ith` is expected to have set to +inf.
    assert torch.isinf(logits_processor_output[:, 0]).all()

    # Any other column should only have been scaled; compare column 1.
    fake_logits.mul_(logits_processor.scale)
    torch.testing.assert_close(logits_processor_output[:, 1],
                               fake_logits[:, 1],
                               rtol=1e-4,
                               atol=0.0)
tests/test_cache_block_hashing.py
deleted
100644 → 0
View file @
2fc24e94
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test hashing of cache blocks.
Run `pytest tests/test_cache_block_hashing.py`.
"""
from
typing
import
Optional
import
pytest
from
vllm.inputs
import
token_inputs
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
Sequence
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
# Two openings that differ, so the resulting prefixes have different first
# blocks.
prefix_start = ["You are an expert", "You are a"]

# Text shared by both prefixes; it is appended to each opening verbatim.
prefix_common = (
    " school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
    "community, joyful discovery, and life-long learning. The candidate is "
    "coming in for a first-round panel interview for a 8th grade Math "
    "teaching role. They have 5 years of previous teaching experience "
    "as an assistant teacher at a co-ed, public school with experience "
    "in middle school math teaching. Based on this, fulfill "
    "the following: "
)
prefixes = [opening + prefix_common for opening in prefix_start]

# Sample prompts.
sample_prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Helper function.
def flatten_2d(li):
    """Return a flat list with the items of every element of ``li``, in order."""
    flat = []
    for inner in li:
        flat.extend(inner)
    return flat
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("max_num_seqs", [256])
@pytest.mark.parametrize("concurrent_lora_int_ids",
                         [[None], [1], [None, 1], [None, 1, 2], [1, 2]])
def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
                             concurrent_lora_int_ids: list[Optional[int]]):
    """Check block hashes across prefixes, prompts and LoRA ids.

    Args:
        model: Name passed to ``get_tokenizer`` to tokenize the prompts.
        block_size: Block size given to every ``Sequence``; only full blocks
            (``len(prompt_token_ids) // block_size``) are hashed.
        max_num_seqs: Parametrized but not used by the test body.
        concurrent_lora_int_ids: LoRA ids to build requests for; ``None``
            means no LoRA request.
    """
    # Honor the parametrized model instead of a hard-coded tokenizer name.
    tokenizer = get_tokenizer(model)

    # hashes[g][p][b]: hash of block `b` of sample prompt `p` in group `g`.
    # Groups are ordered prefix-major, LoRA id minor.
    hashes: list[list[list[int]]] = []

    for prefix in prefixes:
        for lora_int_id in concurrent_lora_int_ids:
            lora_request = None
            if lora_int_id is not None:
                lora_request = LoRARequest(
                    f"example_lora_{lora_int_id}",
                    lora_int_id,
                    f"example/path/to/lora_{lora_int_id}",
                )

            group_hashes: list[list[int]] = []
            for seq_id, sample in enumerate(sample_prompts):
                prompt = prefix + sample
                prompt_token_ids = tokenizer.encode(prompt)
                seq = Sequence(seq_id,
                               inputs=token_inputs(prompt_token_ids,
                                                   prompt=prompt),
                               block_size=block_size,
                               eos_token_id=tokenizer.eos_token_id,
                               lora_request=lora_request)

                num_blocks = len(prompt_token_ids) // block_size
                group_hashes.append(
                    [seq.hash_of_block(idx) for idx in range(num_blocks)])
            hashes.append(group_hashes)

    # Check that hashes made with two prefixes with different first blocks are
    # different everywhere. Groups are paired by LoRA id across the two
    # prefixes; comparing hashes[0] with hashes[1] would pair two LoRA ids of
    # the *same* prefix whenever more than one LoRA id is parametrized.
    num_loras = len(concurrent_lora_int_ids)
    for lora_idx in range(num_loras):
        first_prefix_hashes = flatten_2d(hashes[lora_idx])
        second_prefix_hashes = flatten_2d(hashes[num_loras + lora_idx])
        for hash0, hash1 in zip(first_prefix_hashes, second_prefix_hashes):
            assert hash0 != hash1

    # Check that hashes of different prompts made with the same prefix are the
    # same until the hashes that contain the prompt.
    for hash_pref in hashes:
        same_hashes = [tuple(h[:-1]) for h in hash_pref]
        different_hashes = [h[-1] for h in hash_pref]
        assert len(set(same_hashes)) == 1
        assert len(set(different_hashes)) == len(different_hashes)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment