Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7c4f76e3
Commit
7c4f76e3
authored
Apr 15, 2024
by
zhuwenwen
Browse files
merge v0.4.0
parents
2da0dd3e
51c31bc1
Changes
332
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
978 additions
and
165 deletions
+978
-165
tests/lora/test_tokenizer_group.py
tests/lora/test_tokenizer_group.py
+55
-0
tests/lora/test_utils.py
tests/lora/test_utils.py
+1
-1
tests/lora/test_worker.py
tests/lora/test_worker.py
+4
-4
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+8
-6
tests/models/test_big_models.py
tests/models/test_big_models.py
+45
-0
tests/models/test_llava.py
tests/models/test_llava.py
+107
-0
tests/models/test_marlin.py
tests/models/test_marlin.py
+12
-11
tests/models/test_mistral.py
tests/models/test_mistral.py
+4
-1
tests/models/test_models.py
tests/models/test_models.py
+9
-9
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+68
-34
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+8
-1
tests/samplers/test_logprobs.py
tests/samplers/test_logprobs.py
+36
-6
tests/samplers/test_ranks.py
tests/samplers/test_ranks.py
+50
-0
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+2
-3
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+254
-87
tests/samplers/test_seeded_generate.py
tests/samplers/test_seeded_generate.py
+2
-2
tests/samplers/test_stop_reason.py
tests/samplers/test_stop_reason.py
+59
-0
tests/spec_decode/__init__.py
tests/spec_decode/__init__.py
+0
-0
tests/spec_decode/test_batch_expansion.py
tests/spec_decode/test_batch_expansion.py
+95
-0
tests/spec_decode/test_metrics.py
tests/spec_decode/test_metrics.py
+159
-0
No files found.
tests/lora/test_tokenizer.py
→
tests/lora/test_tokenizer
_group
.py
View file @
7c4f76e3
...
@@ -2,57 +2,43 @@ import pytest
...
@@ -2,57 +2,43 @@ import pytest
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.transformers_utils.tokenizer
import
TokenizerGroup
,
get_lora_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_lora_tokenizer
from
vllm.transformers_utils.tokenizer_group
import
get_tokenizer_group
from
..conftest
import
get_tokenizer_pool_config
@
pytest
.
mark
.
asyncio
async
def
test_transformers_tokenizer
():
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
"gpt2"
)
tokenizer
=
TokenizerGroup
(
tokenizer_id
=
"gpt2"
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
)
assert
reference_tokenizer
.
encode
(
"prompt"
)
==
tokenizer
.
encode
(
request_id
=
"request_id"
,
prompt
=
"prompt"
,
lora_request
=
None
)
assert
reference_tokenizer
.
encode
(
"prompt"
)
==
await
tokenizer
.
encode_async
(
request_id
=
"request_id"
,
prompt
=
"prompt"
,
lora_request
=
None
)
assert
isinstance
(
tokenizer
.
get_lora_tokenizer
(
None
),
PreTrainedTokenizerBase
)
assert
tokenizer
.
get_lora_tokenizer
(
None
)
==
await
tokenizer
.
get_lora_tokenizer_async
(
None
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_transformers_tokenizer_lora
(
sql_lora_files
):
@
pytest
.
mark
.
parametrize
(
"tokenizer_group_type"
,
[
None
,
"ray"
])
async
def
test_tokenizer_group_lora
(
sql_lora_files
,
tokenizer_group_type
):
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
sql_lora_files
)
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
sql_lora_files
)
tokenizer
=
TokenizerGroup
(
tokenizer_group
=
get_tokenizer_group
(
get_tokenizer_pool_config
(
tokenizer_group_type
),
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
"gpt2"
,
enable_lora
=
True
,
enable_lora
=
True
,
max_num_seqs
=
1
,
max_num_seqs
=
1
,
max_input_length
=
None
,
max_input_length
=
None
,
)
)
lora_request
=
LoRARequest
(
"1"
,
1
,
sql_lora_files
)
lora_request
=
LoRARequest
(
"1"
,
1
,
sql_lora_files
)
assert
reference_tokenizer
.
encode
(
"prompt"
)
==
tokenizer
.
encode
(
assert
reference_tokenizer
.
encode
(
"prompt"
)
==
tokenizer
_group
.
encode
(
request_id
=
"request_id"
,
prompt
=
"prompt"
,
lora_request
=
lora_request
)
request_id
=
"request_id"
,
prompt
=
"prompt"
,
lora_request
=
lora_request
)
assert
reference_tokenizer
.
encode
(
assert
reference_tokenizer
.
encode
(
"prompt"
)
==
await
tokenizer
.
encode_async
(
request_id
=
"request_id"
,
"prompt"
)
==
await
tokenizer_group
.
encode_async
(
prompt
=
"prompt"
,
request_id
=
"request_id"
,
lora_request
=
lora_request
)
prompt
=
"prompt"
,
assert
isinstance
(
tokenizer
.
get_lora_tokenizer
(
None
),
lora_request
=
lora_request
)
assert
isinstance
(
tokenizer_group
.
get_lora_tokenizer
(
None
),
PreTrainedTokenizerBase
)
PreTrainedTokenizerBase
)
assert
tokenizer
.
get_lora_tokenizer
(
assert
tokenizer
_group
.
get_lora_tokenizer
(
None
)
==
await
tokenizer
.
get_lora_tokenizer_async
(
None
)
None
)
==
await
tokenizer
_group
.
get_lora_tokenizer_async
(
None
)
assert
isinstance
(
tokenizer
.
get_lora_tokenizer
(
lora_request
),
assert
isinstance
(
tokenizer
_group
.
get_lora_tokenizer
(
lora_request
),
PreTrainedTokenizerBase
)
PreTrainedTokenizerBase
)
assert
tokenizer
.
get_lora_tokenizer
(
assert
tokenizer_group
.
get_lora_tokenizer
(
lora_request
)
!=
tokenizer
.
get_lora_tokenizer
(
None
)
lora_request
)
!=
tokenizer_group
.
get_lora_tokenizer
(
None
)
assert
tokenizer
.
get_lora_tokenizer
(
assert
tokenizer_group
.
get_lora_tokenizer
(
lora_request
)
==
await
tokenizer
.
get_lora_tokenizer_async
(
lora_request
)
lora_request
)
==
await
tokenizer_group
.
get_lora_tokenizer_async
(
lora_request
)
def
test_get_lora_tokenizer
(
sql_lora_files
,
tmpdir
):
def
test_get_lora_tokenizer
(
sql_lora_files
,
tmpdir
):
...
...
tests/lora/test_utils.py
View file @
7c4f76e3
...
@@ -2,8 +2,8 @@ from collections import OrderedDict
...
@@ -2,8 +2,8 @@ from collections import OrderedDict
from
torch
import
nn
from
torch
import
nn
from
vllm.lora.utils
import
parse_fine_tuned_lora_name
,
replace_submodule
from
vllm.utils
import
LRUCache
from
vllm.utils
import
LRUCache
from
vllm.lora.utils
import
(
parse_fine_tuned_lora_name
,
replace_submodule
)
def
test_parse_fine_tuned_lora_name
():
def
test_parse_fine_tuned_lora_name
():
...
...
tests/lora/test_worker.py
View file @
7c4f76e3
...
@@ -3,10 +3,10 @@ import random
...
@@ -3,10 +3,10 @@ import random
import
tempfile
import
tempfile
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
vllm.config
import
(
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.lora.models
import
LoRAMapping
from
vllm.lora.models
import
LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.config
import
(
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
DeviceConfig
,
LoRAConfig
)
from
vllm.worker.worker
import
Worker
from
vllm.worker.worker
import
Worker
...
@@ -25,7 +25,7 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -25,7 +25,7 @@ def test_worker_apply_lora(sql_lora_files):
revision
=
None
,
revision
=
None
,
),
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
,
256
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
),
device_config
=
DeviceConfig
(
"cuda"
),
device_config
=
DeviceConfig
(
"cuda"
),
local_rank
=
0
,
local_rank
=
0
,
rank
=
0
,
rank
=
0
,
...
@@ -33,7 +33,7 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -33,7 +33,7 @@ def test_worker_apply_lora(sql_lora_files):
max_loras
=
32
),
max_loras
=
32
),
distributed_init_method
=
f
"file://
{
tempfile
.
mkstemp
()[
1
]
}
"
,
distributed_init_method
=
f
"file://
{
tempfile
.
mkstemp
()[
1
]
}
"
,
)
)
worker
.
init_
model
()
worker
.
init_
device
()
worker
.
load_model
()
worker
.
load_model
()
worker
.
model_runner
.
set_active_loras
([],
LoRAMapping
([],
[]))
worker
.
model_runner
.
set_active_loras
([],
LoRAMapping
([],
[]))
...
...
tests/metrics/test_metrics.py
View file @
7c4f76e3
...
@@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens(
...
@@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens(
gpu_memory_utilization
=
0.4
)
gpu_memory_utilization
=
0.4
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
# This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding.
# This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
"prompts of different lengths are required"
)
"prompts of different lengths are required"
)
...
@@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens(
...
@@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens(
**
stat_logger
.
labels
).
_value
.
get
()
**
stat_logger
.
labels
).
_value
.
get
()
assert
vllm_prompt_token_count
==
metric_count
,
(
assert
vllm_prompt_token_count
==
metric_count
,
(
f
"prompt token count:
{
vllm_prompt_token_count
!
r
}
\n
metric:
{
metric_count
!
r
}
"
f
"prompt token count:
{
vllm_prompt_token_count
!
r
}
\n
"
)
f
"metric:
{
metric_count
!
r
}
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens(
...
@@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens(
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
# vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens.
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
assert
vllm_generation_count
==
metric_count
,
(
assert
vllm_generation_count
==
metric_count
,
(
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
metric:
{
metric_count
!
r
}
"
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
)
f
"metric:
{
metric_count
!
r
}
"
)
tests/models/test_big_models.py
0 → 100644
View file @
7c4f76e3
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import
pytest
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
# "mistralai/Mistral-7B-v0.1", # Broken
# "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b"
,
"mosaicml/mpt-7b"
,
# "Qwen/Qwen1.5-0.5B" # Broken,
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that greedy decoding matches between the HF and vLLM runners.

    Each runner generates ``max_tokens`` tokens greedily for every prompt in
    ``example_prompts``; the generated text and token ids must be equal
    prompt by prompt.
    """
    # Reference outputs. The HF model is dropped before the vLLM model is
    # created so the two are never alive at the same time.
    reference_model = hf_runner(model, dtype=dtype)
    hf_outputs = reference_model.generate_greedy(example_prompts, max_tokens)
    del reference_model

    candidate_model = vllm_runner(model, dtype=dtype)
    vllm_outputs = candidate_model.generate_greedy(example_prompts,
                                                   max_tokens)
    del candidate_model

    for i, _ in enumerate(example_prompts):
        hf_ids, hf_text = hf_outputs[i]
        vllm_ids, vllm_text = vllm_outputs[i]
        assert hf_text == vllm_text, (
            f"Test{i}:\nHF: {hf_text!r}\nvLLM: {vllm_text!r}")
        assert hf_ids == vllm_ids, (
            f"Test{i}:\nHF: {hf_ids}\nvLLM: {vllm_ids}")
tests/models/test_llava.py
0 → 100644
View file @
7c4f76e3
import
gc
from
dataclasses
import
fields
from
enum
import
Enum
from
typing
import
Dict
,
List
,
Tuple
import
pytest
import
torch
from
transformers
import
AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
model_and_vl_config
=
[
(
"llava-hf/llava-1.5-7b-hf"
,
VisionLanguageConfig
(
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
image_feature_size
=
576
,
image_token_id
=
32000
,
image_input_shape
=
(
1
,
3
,
336
,
336
))),
(
"llava-hf/llava-1.5-7b-hf"
,
VisionLanguageConfig
(
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
,
image_feature_size
=
576
,
image_token_id
=
32000
,
image_input_shape
=
(
1
,
576
,
1024
)))
]
def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
    """Flatten a vision language config into plain keyword arguments.

    The result is meant to be compatible with what the llm entrypoint
    expects: one entry per dataclass field, keyed by the field name.

    * Enum values become their lower-cased member name.
    * Tuples become a comma-separated string of their items.
    * Every other value is passed through unchanged.
    """
    flattened = {}
    for spec in fields(vision_language_config):
        key = spec.name
        raw = getattr(vision_language_config, key)
        if isinstance(raw, Enum):
            raw = raw.name.lower()
        elif isinstance(raw, tuple):
            raw = ",".join(map(str, raw))
        flattened[key] = raw
    return flattened
def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
                         vision_language_config: VisionLanguageConfig,
                         model_id: str):
    """Make a vLLM output comparable with the corresponding HF output.

    ``input_ids`` of the form 1, 32000, 32000, ..., 32000, x1, x2, x3 ...
    are reduced to 1, 32000, x1, x2, x3 ..., and ``output_str`` of the form
    "<image><image>bla" is reduced to "bla".

    Returns the pair ``(sanitized_input_ids, sanitized_output_str)``.
    """
    feature_size = vision_language_config.image_feature_size

    # The textual form of the image token is needed to know how many
    # characters the repeated placeholder occupies in the output string.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    placeholder = tokenizer.decode(vision_language_config.image_token_id)

    token_ids, text = vllm_output
    # Keep the first two ids, then skip the remaining
    # ``feature_size - 1`` repeated image-token ids.
    kept_ids = token_ids[:2] + token_ids[feature_size + 1:]
    # Drop ``feature_size`` copies of the placeholder from the text.
    kept_text = text[feature_size * len(placeholder):]
    return kept_ids, kept_text
@pytest.mark.parametrize("worker_use_ray", [False])
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
                vllm_image_prompts, vllm_images, model_and_config: tuple,
                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
    """Inference results should be the same between hf and vllm.

    All the image fixtures for the test are under tests/images.
    For the huggingface runner, we provide the raw images as input.
    For the vllm runner, we provide image tensors and the corresponding
    vision language config as input.
    Note, the text input is also adjusted to abide by the vllm contract.
    The text output is sanitized to be comparable with hf.
    """
    model_id, vision_language_config = model_and_config

    reference_model = hf_runner(model_id, dtype=dtype)
    hf_outputs = reference_model.generate_greedy(hf_image_prompts,
                                                 max_tokens,
                                                 images=hf_images)
    del reference_model

    # The vision language config is passed to the runner as flat kwargs.
    vl_kwargs = as_dict(vision_language_config)
    candidate_model = vllm_runner(model_id,
                                  dtype=dtype,
                                  worker_use_ray=worker_use_ray,
                                  **vl_kwargs)
    vllm_outputs = candidate_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)
    del candidate_model

    gc.collect()
    torch.cuda.empty_cache()

    for i, _ in enumerate(hf_image_prompts):
        hf_ids, hf_text = hf_outputs[i]
        vllm_ids, vllm_text = sanitize_vllm_output(vllm_outputs[i],
                                                   vision_language_config,
                                                   model_id)
        assert hf_text == vllm_text, (
            f"Test{i}:\nHF: {hf_text!r}\nvLLM: {vllm_text!r}")
        assert hf_ids == vllm_ids, (
            f"Test{i}:\nHF: {hf_ids}\nvLLM: {vllm_ids}")
tests/models/test_marlin.py
View file @
7c4f76e3
"""Compare the outputs of a GPTQ model to a Marlin model.
"""Compare the outputs of a GPTQ model to a Marlin model.
Note: GPTQ and Marlin do not have bitwise correctness.
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Marlin/GPTQ models are in the top 3 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py
--forked
`.
Run `pytest tests/models/test_marlin.py`.
"""
"""
from
dataclasses
import
dataclass
import
pytest
import
pytest
import
torch
import
torch
from
dataclasses
import
dataclass
from
vllm.model_executor.layers.quantization
import
_QUANTIZATION_CONFIG_REGISTRY
from
vllm.model_executor.layers.quantization
import
(
_QUANTIZATION_CONFIG_REGISTRY
)
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
...
@@ -60,7 +63,6 @@ def test_models(
...
@@ -60,7 +63,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model
# does not free the GPU memory. On Ampere, deleting just the model
# frees the memory.
# frees the memory.
del
marlin_model
.
model
.
llm_engine
.
driver_worker
del
marlin_model
del
marlin_model
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
)
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
)
...
@@ -71,7 +73,6 @@ def test_models(
...
@@ -71,7 +73,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model
# does not free the GPU memory. On Ampere, deleting just the model
# frees the memory.
# frees the memory.
del
gptq_model
.
model
.
llm_engine
.
driver_worker
del
gptq_model
del
gptq_model
# loop through the prompts
# loop through the prompts
...
@@ -87,11 +88,11 @@ def test_models(
...
@@ -87,11 +88,11 @@ def test_models(
if
marlin_output_id
!=
gptq_output_id
:
if
marlin_output_id
!=
gptq_output_id
:
# Each predicted token must be in top 5 of the other's
# Each predicted token must be in top 5 of the other's
assert
gptq_output_id
in
marlin_logprobs
[
idx
],
(
assert
gptq_output_id
in
marlin_logprobs
[
idx
],
(
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
Marlin:
\t
{
marlin_output_str
!
r
}
"
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
"
)
f
"Marlin:
\t
{
marlin_output_str
!
r
}
"
)
assert
marlin_output_id
in
gptq_logprobs
[
idx
],
(
assert
marlin_output_id
in
gptq_logprobs
[
idx
],
(
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
Marlin:
\t
{
marlin_output_str
!
r
}
"
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
"
)
f
"Marlin:
\t
{
marlin_output_str
!
r
}
"
)
# Break out since sequences will now diverge.
# Break out since sequences will now diverge.
break
break
tests/models/test_mistral.py
View file @
7c4f76e3
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py
--forked
`.
Run `pytest tests/models/test_mistral.py`.
"""
"""
import
pytest
import
pytest
...
@@ -12,6 +12,9 @@ MODELS = [
...
@@ -12,6 +12,9 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
skip
(
"Two problems: 1. Failing correctness tests. 2. RuntimeError: expected "
"scalar type BFloat16 but found Half (only in CI)."
)
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
...
...
tests/models/test_models.py
View file @
7c4f76e3
"""Compare the outputs of HF and vLLM when using greedy sampling.
"""Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py --forked`.
This test only tests small models. Big models such as 7B should be tested from
test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`.
"""
"""
import
pytest
import
pytest
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
"mistralai/Mistral-7B-v0.1"
,
"Deci/DeciLM-7b"
,
"tiiuae/falcon-7b"
,
"gpt2"
,
"gpt2"
,
"bigcode/tiny_starcoder_py"
,
"bigcode/tiny_starcoder_py"
,
"EleutherAI/gpt-j-6b"
,
"EleutherAI/pythia-70m"
,
"EleutherAI/pythia-70m"
,
"bigscience/bloom-560m"
,
"bigscience/bloom-560m"
,
"mosaicml/mpt-7b"
,
"microsoft/phi-2"
,
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
,
"stabilityai/stablelm-3b-4e1t"
,
"allenai/OLMo-1B"
,
#
"allenai/OLMo-1B",
# Broken
"bigcode/starcoder2-3b"
,
"bigcode/starcoder2-3b"
,
]
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
...
@@ -34,6 +31,9 @@ def test_models(
...
@@ -34,6 +31,9 @@ def test_models(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
# To pass the small model tests, we need full precision.
assert
dtype
==
"float"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
del
hf_model
...
...
tests/prefix_caching/test_prefix_caching.py
View file @
7c4f76e3
...
@@ -4,38 +4,72 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
...
@@ -4,38 +4,72 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
"""
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.core.block_manager_v1
import
CachedBlockAllocator
from
vllm.utils
import
Device
prefix
=
(
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
16
])
"community, joyful discovery, and life-long learning. The candidate is "
def
test_block_allocator
(
"coming in for a first-round panel interview for a 8th grade Math "
block_size
:
int
,
"teaching role. They have 5 years of previous teaching experience "
num_blocks
:
int
,
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: "
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
16
])
def
test_prefix_caching
(
example_prompts
,
model
:
str
,
max_tokens
:
int
,
):
):
llm
=
LLM
(
model
=
model
)
block_hash
=
1
# -1 since the last token can change when concatenating prompts.
block_allocator
=
CachedBlockAllocator
(
Device
.
CPU
,
block_size
,
num_blocks
)
prefix_pos
=
len
(
llm
.
llm_engine
.
tokenizer
.
encode
(
prefix
))
-
1
prompts
=
[
prefix
+
prompt
for
prompt
in
example_prompts
]
# Allocate two PhysicalTokenBlocks with the same hash and check
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
# that they are the same PhysicalTokenBlock
outputs_without_prefix
=
llm
.
generate
(
prompts
,
sampling_params
)
first_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
outputs_with_prefix
=
llm
.
generate
(
prompts
,
second_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
sampling_params
,
assert
(
first_block
==
second_block
)
prefix_pos
=
[
prefix_pos
]
*
len
(
prompts
))
assert
(
second_block
.
ref_count
==
2
)
for
output_without_prefix
,
output_with_prefix
in
zip
(
outputs_without_prefix
,
outputs_with_prefix
):
# Free the first_block and confirm that the ref_count is correctly
assert
(
output_without_prefix
.
outputs
[
0
].
token_ids
==
# decremented on the second block
output_with_prefix
.
outputs
[
0
].
token_ids
)
block_allocator
.
free
(
first_block
)
assert
len
(
llm
.
llm_engine
.
scheduler
.
prefix_pool
.
prefixes
)
==
1
assert
(
second_block
.
ref_count
==
1
)
# Free the second block
block_allocator
.
free
(
second_block
)
# Reallocate the first block and confirm that, even after the block
# had its ref_count go to 0, we still get the same block back
first_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
assert
(
first_block
==
second_block
)
assert
(
first_block
.
block_hash
==
block_hash
)
@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int, ):
    """Check which cached block is reused once every block has been freed.

    The allocator is filled with ``num_blocks`` blocks whose hashes are
    ``0 .. num_blocks - 1``, all of them are freed, and then new hashes are
    allocated to observe which freed block is handed back.
    """
    block_size = 16
    block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)

    # Use the index i as the block_hash of the i-th block.
    blocks = [block_allocator.allocate(i, 0) for i in range(num_blocks)]

    # Free all blocks, in allocation order.
    for block in blocks:
        block_allocator.free(block)

    # Allocate a new block and confirm that it's the first block freed,
    # i.e. the least recently used block.
    # The fresh hash is derived from num_blocks (not block_size) so that it
    # can never collide with the hashes 0..num_blocks-1 used above, whatever
    # values the test is parametrized with.
    new_block_hash = num_blocks
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert new_block == blocks[0]
    assert new_block.block_hash == new_block_hash

    # Reallocate the second block to remove it from the free list.
    realloc_block_hash = 1
    realloc_block = block_allocator.allocate(realloc_block_hash, 0)
    assert realloc_block == blocks[realloc_block_hash]
    assert realloc_block.block_hash == realloc_block_hash

    # Allocate a new block and confirm that it's not the realloc_block,
    # since the realloc_block shouldn't be in the free list.
    new_block_hash = num_blocks + 1
    new_block = block_allocator.allocate(new_block_hash, 0)
    assert realloc_block != new_block
    assert new_block.block_hash == new_block_hash
    # Blocks 0 and 1 are in use again, so block 2 is the next one reused.
    assert new_block.block_number == 2
tests/samplers/test_beam_search.py
View file @
7c4f76e3
"""Compare the outputs of HF and vLLM when using beam search.
"""Compare the outputs of HF and vLLM when using beam search.
Run `pytest tests/samplers/test_beam_search.py
--forked
`.
Run `pytest tests/samplers/test_beam_search.py`.
"""
"""
import
gc
import
pytest
import
pytest
import
torch
# FIXME(zhuohan): The test can not pass if we:
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
# 1. Increase max_tokens to 256.
...
@@ -36,6 +39,10 @@ def test_beam_search_single_input(
...
@@ -36,6 +39,10 @@ def test_beam_search_single_input(
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
max_tokens
)
del
vllm_model
del
vllm_model
# NOTE(woosuk): For some reason, the following GC is required to avoid
# GPU OOM errors in the following tests using `vllm_runner`.
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
_
=
hf_outputs
[
i
]
hf_output_ids
,
_
=
hf_outputs
[
i
]
...
...
tests/samplers/test_logprobs.py
View file @
7c4f76e3
import
pytest
import
pytest
import
torch
import
torch
from
tests.conftest
import
VllmRunner
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
MODELS
=
[
"facebook/opt-125m"
]
MODELS
=
[
"facebook/opt-125m"
]
...
@@ -16,6 +17,7 @@ def test_get_prompt_logprobs(
...
@@ -16,6 +17,7 @@ def test_get_prompt_logprobs(
example_prompts
,
example_prompts
,
):
):
max_tokens
=
5
max_tokens
=
5
num_top_logprobs
=
6
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
hf_logprobs
=
hf_model
.
generate_greedy_logprobs
(
example_prompts
,
example_prompts
,
...
@@ -23,19 +25,32 @@ def test_get_prompt_logprobs(
...
@@ -23,19 +25,32 @@ def test_get_prompt_logprobs(
)
)
del
hf_model
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
max_logprobs
=
num_top_logprobs
)
vllm_sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
vllm_sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
logprobs
=
5
,
logprobs
=
num_top_logprobs
,
prompt_logprobs
=
5
,
prompt_logprobs
=
5
,
temperature
=
0.0
)
temperature
=
0.0
)
vllm_results
=
vllm_model
.
model
.
generate
(
vllm_results
=
vllm_model
.
model
.
generate
(
example_prompts
,
sampling_params
=
vllm_sampling_params
)
example_prompts
,
sampling_params
=
vllm_sampling_params
)
del
vllm_model
# Test whether logprobs are included in the results.
# Test whether logprobs are included in the results.
for
result
in
vllm_results
:
for
result
in
vllm_results
:
assert
result
.
prompt_logprobs
is
not
None
assert
result
.
prompt_logprobs
is
not
None
assert
result
.
outputs
[
0
].
logprobs
is
not
None
assert
result
.
outputs
[
0
].
logprobs
is
not
None
assert
len
(
result
.
outputs
[
0
].
logprobs
)
==
max_tokens
for
logprobs
in
result
.
outputs
[
0
].
logprobs
:
assert
len
(
logprobs
)
==
num_top_logprobs
output_text
=
result
.
outputs
[
0
].
text
output_string_from_most_likely_tokens
=
[]
for
top_logprobs
in
result
.
outputs
[
0
].
logprobs
:
top_logprob
=
next
(
iter
(
top_logprobs
.
values
()))
output_string_from_most_likely_tokens
.
append
(
top_logprob
.
decoded_token
)
output_string_from_most_likely_tokens
=
""
.
join
(
output_string_from_most_likely_tokens
)
assert
output_text
==
output_string_from_most_likely_tokens
,
(
"The output text from the top logprob for each token position "
"should be the same as the output text in the result."
)
# Test whether prompt logprobs are consistent with HF
# Test whether prompt logprobs are consistent with HF
for
vllm_result
,
hf_logprob
in
zip
(
vllm_results
,
hf_logprobs
):
for
vllm_result
,
hf_logprob
in
zip
(
vllm_results
,
hf_logprobs
):
...
@@ -43,14 +58,29 @@ def test_get_prompt_logprobs(
...
@@ -43,14 +58,29 @@ def test_get_prompt_logprobs(
vllm_prompt_logprobs
=
vllm_result
.
prompt_logprobs
[
1
:]
vllm_prompt_logprobs
=
vllm_result
.
prompt_logprobs
[
1
:]
for
i
,
vllm_prompt_logprob_dict
in
enumerate
(
vllm_prompt_logprobs
):
for
i
,
vllm_prompt_logprob_dict
in
enumerate
(
vllm_prompt_logprobs
):
for
token_id
,
logprob
in
vllm_prompt_logprob_dict
.
items
():
for
token_id
,
logprob
in
vllm_prompt_logprob_dict
.
items
():
torch
.
testing
.
assert_close
(
logprob
,
torch
.
testing
.
assert_close
(
logprob
.
logprob
,
hf_logprob
[
0
][
i
][
token_id
].
item
(),
hf_logprob
[
0
][
i
][
token_id
].
item
(),
atol
=
1e-2
,
atol
=
1e-2
,
rtol
=
1e-2
)
rtol
=
1e-2
)
vllm_sample_logprobs
=
vllm_result
.
outputs
[
0
].
logprobs
vllm_sample_logprobs
=
vllm_result
.
outputs
[
0
].
logprobs
for
i
,
vllm_sample_logprob_dict
in
enumerate
(
vllm_sample_logprobs
):
for
i
,
top_logprobs
in
enumerate
(
vllm_sample_logprobs
):
for
token_id
,
logprob
in
vllm_sample_logprob_dict
.
items
():
for
token_id
,
sample_logprob
in
top_logprobs
.
items
():
logprob
=
sample_logprob
.
logprob
torch
.
testing
.
assert_close
(
logprob
,
torch
.
testing
.
assert_close
(
logprob
,
hf_logprob
[
i
][
-
1
][
token_id
].
item
(),
hf_logprob
[
i
][
-
1
][
token_id
].
item
(),
atol
=
1e-2
,
atol
=
1e-2
,
rtol
=
1e-2
)
rtol
=
1e-2
)
assert
isinstance
(
sample_logprob
.
decoded_token
,
str
),
(
"The token should be decoded by the time it is returned "
" to the user."
)
def test_max_logprobs():
    """Check that requests asking for more logprobs than allowed are rejected.

    The runner is built with ``max_logprobs=1``. A request for one logprob
    must go through, while a request for two is expected to raise
    ``ValueError`` from ``generate``.
    """
    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)

    # should pass
    within_limit = SamplingParams(logprobs=1)
    runner.generate(["Hello world"], sampling_params=within_limit)

    # Built outside the `raises` block so that only `generate` is allowed
    # to raise the expected error.
    over_limit = SamplingParams(logprobs=2)
    with pytest.raises(ValueError):
        runner.generate(["Hello world"], sampling_params=over_limit)
tests/samplers/test_ranks.py
0 → 100644
View file @
7c4f76e3
import
pytest
from
vllm
import
SamplingParams
MODELS
=
[
"facebook/opt-125m"
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
    vllm_runner,
    model,
    dtype,
    example_prompts,
):
    """Check the ``rank`` attached to the logprob of every chosen token.

    Each result from ``generate_w_logprobs`` is indexed positionally:
    element 0 is iterated as the chosen tokens and element 2 as the
    matching per-position logprob mappings (presumably token id -> logprob
    entry; confirm against the runner helper).
    """
    max_tokens = 5
    num_top_logprobs = 5
    num_prompt_logprobs = 5

    vllm_model = vllm_runner(model,
                             dtype=dtype,
                             max_logprobs=num_top_logprobs)

    # Everything except the temperature is shared by both runs.
    shared_kwargs = {
        "top_p": 1.0,
        "max_tokens": max_tokens,
        "logprobs": num_top_logprobs,
        "prompt_logprobs": num_prompt_logprobs,
    }

    ## Test greedy logprobs ranks
    greedy_params = SamplingParams(temperature=0.0, **shared_kwargs)
    greedy_results = vllm_model.generate_w_logprobs(example_prompts,
                                                    greedy_params)
    for greedy_result in greedy_results:
        assert greedy_result[2] is not None
        assert len(greedy_result[2]) == len(greedy_result[0])
        # check whether all chosen tokens have ranks = 1
        for chosen, candidates in zip(greedy_result[0], greedy_result[2]):
            assert chosen in candidates
            assert candidates[chosen].rank == 1

    ## Test non-greedy logprobs ranks
    random_params = SamplingParams(temperature=1.0, **shared_kwargs)
    random_results = vllm_model.generate_w_logprobs(example_prompts,
                                                    random_params)
    for random_result in random_results:
        assert random_result[2] is not None
        assert len(random_result[2]) == len(random_result[0])
        # check whether all chosen tokens have ranks
        for chosen, candidates in zip(random_result[0], random_result[2]):
            assert candidates[chosen].rank >= 1
tests/samplers/test_rejection_sampler.py
View file @
7c4f76e3
"""Tests for rejection sampling."""
"""Tests for rejection sampling."""
import
pytest
from
typing
import
List
,
Tuple
from
typing
import
List
,
Tuple
import
pytest
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.model_executor.layers.rejection_sampler
import
RejectionSampler
from
vllm.model_executor.layers.rejection_sampler
import
RejectionSampler
from
vllm.model_executor.utils
import
set_random_seed
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
...
...
tests/samplers/test_sampler.py
View file @
7c4f76e3
import
random
import
random
from
typing
import
Tuple
,
List
from
typing
import
List
,
Optional
,
Tuple
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
pytest
import
pytest
import
torch
import
torch
from
transformers
import
GenerationConfig
,
GenerationMixin
from
transformers
import
GenerationConfig
,
GenerationMixin
from
typing
import
Optional
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
Counter
from
vllm.worker.model_runner
import
ModelRunner
from
vllm.worker.model_runner
import
ModelRunner
class
MockLogitsSampler
(
Sampler
):
class
MockLogitsSampler
(
Sampler
):
def
__init__
(
self
,
vocab_size
:
int
,
fake_logits
:
torch
.
Tensor
):
def
__init__
(
self
,
fake_logits
:
torch
.
Tensor
):
super
().
__init__
(
vocab_size
=
vocab_size
)
super
().
__init__
()
self
.
fake_logits
=
fake_logits
self
.
fake_logits
=
fake_logits
def
forward
(
self
,
*
args
,
**
kwargs
):
def
forward
(
self
,
*
args
,
**
kwargs
):
with
patch
(
return
super
().
forward
(
*
args
,
**
kwargs
)
"vllm.model_executor.layers.sampler._prune_hidden_states"
,
lambda
x
,
y
:
x
),
patch
(
"vllm.model_executor.layers.sampler.Sampler._get_logits"
,
lambda
*
args
,
**
kwargs
:
self
.
fake_logits
):
return
super
().
forward
(
*
args
,
**
kwargs
)
def
_prepare_test
(
def
_prepare_test
(
batch_size
:
int
batch_size
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
MockLogitsSampler
,
ModelRunner
]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
MockLogitsSampler
,
ModelRunner
]:
vocab_size
=
32000
input_tensor
=
torch
.
rand
((
batch_size
,
1024
),
dtype
=
torch
.
float16
)
input_tensor
=
torch
.
rand
((
batch_size
,
1024
),
dtype
=
torch
.
float16
)
fake_logits
=
torch
.
full
((
batch_size
,
vocab_size
),
fake_logits
=
torch
.
full
((
batch_size
,
VOCAB_SIZE
),
1e-2
,
1e-2
,
dtype
=
input_tensor
.
dtype
)
dtype
=
input_tensor
.
dtype
)
sampler
=
MockLogitsSampler
(
32000
,
fake_logits
)
sampler
=
MockLogitsSampler
(
fake_logits
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
return
input_tensor
,
fake_logits
,
sampler
,
model_runner
return
input_tensor
,
fake_logits
,
sampler
,
model_runner
VOCAB_SIZE
=
32000
RANDOM_SEEDS
=
list
(
range
(
128
))
RANDOM_SEEDS
=
list
(
range
(
128
))
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
...
@@ -70,9 +65,7 @@ def _do_sample(
...
@@ -70,9 +65,7 @@ def _do_sample(
sampling_metadata
=
model_runner
.
_prepare_sample
(
seq_group_metadata_list
,
sampling_metadata
=
model_runner
.
_prepare_sample
(
seq_group_metadata_list
,
prompt_lens
,
prompt_lens
,
subquery_lens
=
prompt_lens
)
subquery_lens
=
prompt_lens
)
return
sampler
(
embedding
=
None
,
return
sampler
(
logits
=
input_tensor
,
sampling_metadata
=
sampling_metadata
)
hidden_states
=
input_tensor
,
sampling_metadata
=
sampling_metadata
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
...
@@ -85,8 +78,8 @@ def test_sampler_all_greedy(seed: int, device: str):
...
@@ -85,8 +78,8 @@ def test_sampler_all_greedy(seed: int, device: str):
batch_size
)
batch_size
)
sampling_params
=
SamplingParams
(
temperature
=
0
)
sampling_params
=
SamplingParams
(
temperature
=
0
)
sampler_output
=
_do_sample
(
batch_size
,
input_tensor
,
sampl
er
,
sampler_output
=
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runn
er
,
model_runner
,
sampling_params
)
sampling_params
)
expected
=
torch
.
argmax
(
fake_logits
,
dim
=-
1
)
expected
=
torch
.
argmax
(
fake_logits
,
dim
=-
1
)
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
nth_output
in
sequence_output
.
samples
:
for
nth_output
in
sequence_output
.
samples
:
...
@@ -111,8 +104,8 @@ def test_sampler_all_random(seed: int, device: str):
...
@@ -111,8 +104,8 @@ def test_sampler_all_random(seed: int, device: str):
temperature
=
1.0
,
temperature
=
1.0
,
n
=
random
.
randint
(
1
,
10
),
n
=
random
.
randint
(
1
,
10
),
)
)
sampler_output
=
_do_sample
(
batch_size
,
input_tensor
,
sampl
er
,
sampler_output
=
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runn
er
,
model_runner
,
sampling_params
)
sampling_params
)
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
nth_output
in
sequence_output
.
samples
:
for
nth_output
in
sequence_output
.
samples
:
...
@@ -127,8 +120,7 @@ def test_sampler_all_random_seed(seed: int, device: str):
...
@@ -127,8 +120,7 @@ def test_sampler_all_random_seed(seed: int, device: str):
set_random_seed
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
batch_size
=
random
.
randint
(
1
,
256
)
batch_size
=
random
.
randint
(
1
,
256
)
input_tensor
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
_
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
batch_size
)
for
i
in
range
(
batch_size
):
for
i
in
range
(
batch_size
):
fake_logits
[
i
,
i
]
=
1e2
fake_logits
[
i
,
i
]
=
1e2
...
@@ -138,8 +130,8 @@ def test_sampler_all_random_seed(seed: int, device: str):
...
@@ -138,8 +130,8 @@ def test_sampler_all_random_seed(seed: int, device: str):
n
=
random
.
randint
(
1
,
10
),
n
=
random
.
randint
(
1
,
10
),
seed
=
random
.
randint
(
0
,
10000
),
seed
=
random
.
randint
(
0
,
10000
),
)
)
sampler_output
=
_do_sample
(
batch_size
,
input_tensor
,
sampl
er
,
sampler_output
=
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runn
er
,
model_runner
,
sampling_params
)
sampling_params
)
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
i
,
sequence_output
in
enumerate
(
sampler_output
):
for
nth_output
in
sequence_output
.
samples
:
for
nth_output
in
sequence_output
.
samples
:
...
@@ -154,18 +146,17 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
...
@@ -154,18 +146,17 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
set_random_seed
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
batch_size
=
random
.
randint
(
1
,
256
)
batch_size
=
random
.
randint
(
1
,
256
)
input_tensor
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
_
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
batch_size
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
temperature
=
1.0
,
n
=
random
.
randint
(
1
,
10
),
n
=
random
.
randint
(
1
,
10
),
seed
=
random
.
randint
(
0
,
10000
),
seed
=
random
.
randint
(
0
,
10000
),
)
)
first_sampler_output
=
_do_sample
(
batch_size
,
input_tensor
,
sampler
,
first_sampler_output
=
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runner
,
sampling_params
)
model_runner
,
sampling_params
)
second_sampler_output
=
_do_sample
(
batch_size
,
input_tensor
,
sampler
,
second_sampler_output
=
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runner
,
sampling_params
)
model_runner
,
sampling_params
)
assert
first_sampler_output
==
second_sampler_output
assert
first_sampler_output
==
second_sampler_output
...
@@ -179,15 +170,14 @@ def test_sampler_all_beam(seed: int, device: str):
...
@@ -179,15 +170,14 @@ def test_sampler_all_beam(seed: int, device: str):
set_random_seed
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
batch_size
=
random
.
randint
(
1
,
256
)
batch_size
=
random
.
randint
(
1
,
256
)
input_tensor
,
_
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
_
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0
,
temperature
=
0
,
best_of
=
2
,
best_of
=
2
,
use_beam_search
=
True
,
use_beam_search
=
True
,
)
)
_do_sample
(
batch_size
,
input_tensor
,
sampler
,
model_runner
,
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
model_runner
,
sampling_params
)
sampling_params
)
# no assertion here as I am not sure how to determine whether
# no assertion here as I am not sure how to determine whether
# the outputs are expected - in other words, this just tests
# the outputs are expected - in other words, this just tests
# whether there are no exceptions in the sampler
# whether there are no exceptions in the sampler
...
@@ -195,6 +185,225 @@ def test_sampler_all_beam(seed: int, device: str):
...
@@ -195,6 +185,225 @@ def test_sampler_all_beam(seed: int, device: str):
del
model_runner
del
model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_min_tokens_penalty(seed: int, device: str):
    """Check which logits the sampler sets to -inf while `min_tokens` applies.

    For every sequence that is expected to be penalized, the EOS token and
    all stop tokens of its sampling params must be -inf in the logits row
    after the sampler ran, and nothing else in that row may be -inf. For
    sequences not expected to be penalized, no entry may be -inf.

    With ``seed == 0`` a fixed set of hand-written edge cases is run; any
    other seed runs one randomly generated batch.
    """
    # NOTE(review): this draw happens before set_random_seed(seed), so the
    # starting sequence id does not appear to be controlled by `seed`.
    seq_id_counter = Counter(start=random.randint(0, 100))
    set_random_seed(seed)
    torch.set_default_device(device)

    def create_sampling_params(min_tokens,
                               eos_token_id=0,
                               stop_token_ids=None):
        # Build SamplingParams and attach `eos_token_id` as an attribute
        # afterwards (it is not passed to the constructor here).
        sampling_params = SamplingParams(
            min_tokens=min_tokens,
            max_tokens=9999,  # keep higher than max of min_tokens
            stop_token_ids=stop_token_ids,
        )
        sampling_params.eos_token_id = eos_token_id
        return sampling_params

    def create_sequence_data(num_input=3, num_generated=0):
        # Random prompt of `num_input` token ids, optionally with
        # `num_generated` random output token ids assigned on top.
        seq_data = SequenceData(
            random.choices(range(0, VOCAB_SIZE), k=num_input))
        if num_generated > 0:
            seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE),
                                                       k=num_generated)
        return seq_data

    def generate_test_case():
        # generate multiple seq groups but limit total batch size
        batch_size = random.randint(1, 128)

        expected_penalization = []
        sequence_metadata_list = []

        # `batch_size` is used as the remaining budget of sequences and is
        # decremented by the size of each generated group.
        while batch_size > 0:
            # 20% chance to generate prompt seq group with single sequence
            is_prompt = random.random() < 0.2
            num_seqs = 1 if is_prompt else random.randint(1, batch_size)

            eos_token_id = random.randint(0, VOCAB_SIZE - 1)
            min_tokens = random.randint(0, 50)
            num_stop_tokens = random.randint(0, 8)
            if num_stop_tokens > 0:
                stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1),
                                                k=num_stop_tokens)
            else:
                stop_token_ids = None

            sampling_params = create_sampling_params(
                min_tokens=min_tokens,
                eos_token_id=eos_token_id,
                stop_token_ids=stop_token_ids)

            seq_data = {}
            seq_group_penalization = []
            for _ in range(num_seqs):
                num_input = random.randint(1, 100)
                num_generated = random.randint(1, 100) if not is_prompt else 0
                seq_data[next(seq_id_counter)] = create_sequence_data(
                    num_input=num_input, num_generated=num_generated)
                # A sequence is expected to be penalized while it has
                # produced fewer tokens than `min_tokens`.
                seq_group_penalization.append(num_generated < min_tokens)

            expected_penalization.extend(seq_group_penalization)
            sequence_metadata_list.append(
                SequenceGroupMetadata(
                    request_id=f"test_{batch_size}",
                    is_prompt=is_prompt,
                    seq_data=seq_data,
                    sampling_params=sampling_params,
                    block_tables={},
                ))
            batch_size -= num_seqs

        return {
            "expected_penalization": expected_penalization,
            "seq_group_metadata_list": sequence_metadata_list,
        }

    # define some explicit test cases for edge case behavior
    # Prompt with min_tokens=0: nothing to enforce.
    prompt_without_penalization = {
        "expected_penalization": [False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(0),
                block_tables={},
            ),
        ]
    }

    # Prompt with min_tokens=1 and no generated tokens yet.
    prompt_with_penalization = {
        "expected_penalization": [True],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(1),
                block_tables={},
            ),
        ]
    }

    # One token already generated with min_tokens=1: limit reached.
    stop_penalizing_after_min_tokens = {
        "expected_penalization": [False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=False,
                seq_data={
                    next(seq_id_counter):
                    create_sequence_data(num_generated=1),
                },
                sampling_params=create_sampling_params(1),
                block_tables={},
            )
        ]
    }

    stop_token_ids = [42, 99, 42, 0]  # intentional duplication
    # Two decode sequences (1 and 100 generated tokens, min_tokens=2) plus
    # one prompt with min_tokens=0, all sharing the stop token list above.
    simple_combination = {
        "expected_penalization": [True, False, False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
                is_prompt=False,
                seq_data={
                    next(seq_id_counter):
                    create_sequence_data(num_generated=1),
                    next(seq_id_counter):
                    create_sequence_data(num_generated=100),
                },
                sampling_params=create_sampling_params(
                    2, stop_token_ids=stop_token_ids),
                block_tables={},
            ),
            SequenceGroupMetadata(
                request_id="test_2",
                is_prompt=True,
                seq_data={
                    next(seq_id_counter): create_sequence_data(),
                },
                sampling_params=create_sampling_params(
                    0, stop_token_ids=stop_token_ids),
                block_tables={},
            )
        ]
    }

    if seed == 0:
        test_cases = [
            prompt_without_penalization,
            prompt_with_penalization,
            stop_penalizing_after_min_tokens,
            simple_combination,
        ]
    else:
        test_cases = [generate_test_case()]

    def run_test_case(*,
                      expected_penalization=None,
                      seq_group_metadata_list=None):
        # Run the sampler on one case and inspect the logits it was given.
        assert expected_penalization, "Invalid test case"
        assert seq_group_metadata_list, "Invalid test case"

        batch_size = 0
        prompt_lens = []
        sampling_params_per_seq = []
        # Flatten the groups: one prompt length and one sampling-params
        # reference per sequence, in iteration order.
        for sgm in seq_group_metadata_list:
            num_seqs = len(sgm.seq_data)
            batch_size += num_seqs
            sampling_params = sgm.sampling_params
            for seq_id in sgm.seq_data:
                prompt_lens.append(sgm.seq_data[seq_id].get_prompt_len())
                sampling_params_per_seq.append(sampling_params)

        _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
        sampling_metadata = model_runner._prepare_sample(
            seq_group_metadata_list,
            prompt_lens=prompt_lens,
            subquery_lens=prompt_lens)
        # the logits tensor is modified in-place by the sampler
        _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

        for logits_idx, (should_penalize, sampling_params) in enumerate(
                zip(expected_penalization, sampling_params_per_seq)):

            # EOS plus any stop tokens; the set removes duplicates so the
            # count comparison below is against distinct token ids.
            tokens_to_check = [sampling_params.eos_token_id]
            if sampling_params.stop_token_ids:
                tokens_to_check.extend(sampling_params.stop_token_ids)
            tokens_to_check = set(tokens_to_check)

            if should_penalize:
                for token_id in tokens_to_check:
                    assert fake_logits[logits_idx, token_id] == -float(
                        'inf'
                    ), f"Expected token {token_id} for logits row {logits_idx}" " to be penalized"
                # no other tokens should be set to -inf
                assert torch.count_nonzero(
                    fake_logits[logits_idx, :] == -float('inf')) == len(
                        tokens_to_check
                    ), f"Expected only {len(tokens_to_check)} to be penalized"
            else:
                # no tokens should be set to -inf
                assert torch.count_nonzero(
                    fake_logits[logits_idx, :] ==
                    -float('inf')) == 0, "No tokens should have been penalized"

        del model_runner

    for test_case in test_cases:
        run_test_case(**test_case)
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_sampler_mixed
(
seed
:
int
,
device
:
str
):
def
test_sampler_mixed
(
seed
:
int
,
device
:
str
):
...
@@ -246,8 +455,7 @@ def test_sampler_mixed(seed: int, device: str):
...
@@ -246,8 +455,7 @@ def test_sampler_mixed(seed: int, device: str):
def
test_sampling
(
model_runner
:
ModelRunner
):
def
test_sampling
(
model_runner
:
ModelRunner
):
sampling_metadata
=
model_runner
.
_prepare_sample
(
sampling_metadata
=
model_runner
.
_prepare_sample
(
seq_group_metadata_list
,
prompt_lens
,
subquery_lens
=
prompt_lens
)
seq_group_metadata_list
,
prompt_lens
,
subquery_lens
=
prompt_lens
)
sampler_output
=
sampler
(
embedding
=
None
,
sampler_output
=
sampler
(
logits
=
fake_logits
,
hidden_states
=
input_tensor
,
sampling_metadata
=
sampling_metadata
)
sampling_metadata
=
sampling_metadata
)
for
i
,
(
sequence_output
,
metadata
)
in
enumerate
(
for
i
,
(
sequence_output
,
metadata
)
in
enumerate
(
...
@@ -255,9 +463,10 @@ def test_sampler_mixed(seed: int, device: str):
...
@@ -255,9 +463,10 @@ def test_sampler_mixed(seed: int, device: str):
if
metadata
.
sampling_params
.
use_beam_search
:
if
metadata
.
sampling_params
.
use_beam_search
:
continue
continue
if
metadata
.
sampling_params
.
seed
is
not
None
\
if
(
metadata
.
sampling_params
.
seed
is
not
None
and
expected_tokens
[
i
]
is
None
:
and
expected_tokens
[
i
]
is
None
):
# Record seeded random result to compare with results of second invocation
# Record seeded random result to compare with results of
# second invocation
expected_tokens
[
i
]
=
[
expected_tokens
[
i
]
=
[
nth_output
.
output_token
nth_output
.
output_token
for
nth_output
in
sequence_output
.
samples
for
nth_output
in
sequence_output
.
samples
...
@@ -265,11 +474,13 @@ def test_sampler_mixed(seed: int, device: str):
...
@@ -265,11 +474,13 @@ def test_sampler_mixed(seed: int, device: str):
continue
continue
for
n
,
nth_output
in
enumerate
(
sequence_output
.
samples
):
for
n
,
nth_output
in
enumerate
(
sequence_output
.
samples
):
if
metadata
.
sampling_params
.
temperature
==
0
or
metadata
.
sampling_params
.
seed
is
not
None
:
if
(
metadata
.
sampling_params
.
temperature
==
0
or
metadata
.
sampling_params
.
seed
is
not
None
):
# Ensure exact matches for greedy or random with seed
# Ensure exact matches for greedy or random with seed
assert
nth_output
.
output_token
==
expected_tokens
[
i
][
n
]
assert
nth_output
.
output_token
==
expected_tokens
[
i
][
n
]
else
:
else
:
# For non-seeded random check that one of the high-logit tokens were chosen
# For non-seeded random check that one of the high-logit
# tokens were chosen
assert
nth_output
.
output_token
in
expected_tokens
[
i
]
assert
nth_output
.
output_token
in
expected_tokens
[
i
]
# Test batch
# Test batch
...
@@ -284,55 +495,13 @@ def test_sampler_mixed(seed: int, device: str):
...
@@ -284,55 +495,13 @@ def test_sampler_mixed(seed: int, device: str):
input_tensor
.
data
=
input_tensor
.
index_select
(
0
,
target_index
)
input_tensor
.
data
=
input_tensor
.
index_select
(
0
,
target_index
)
fake_logits
.
data
=
fake_logits
.
index_select
(
0
,
target_index
)
fake_logits
.
data
=
fake_logits
.
index_select
(
0
,
target_index
)
# This time, results of seeded random samples will be compared with
the corresponding
# This time, results of seeded random samples will be compared with
# sample in the pre-shuffled batch
#
the corresponding
sample in the pre-shuffled batch
test_sampling
(
model_runner
)
test_sampling
(
model_runner
)
del
model_runner
del
model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_logits_processors(seed: int, device: str):
    """Check that a per-request logits processor steers greedy sampling.

    Every request in a randomly sized batch carries the same processor,
    and each sampled token is compared with its index among the samples
    of its sequence output.
    """
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    seq_group_metadata_list = [
        SequenceGroupMetadata(
            request_id=f"test_{request_idx}",
            is_prompt=True,
            seq_data={0: SequenceData([1, 2, 3])},
            sampling_params=SamplingParams(temperature=0,
                                           logits_processors=[pick_ith]),
            block_tables={0: [1]},
        ) for request_idx in range(batch_size)
    ]
    prompt_lens = [
        group.seq_data[0].get_len() for group in seq_group_metadata_list
    ]

    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
                                                     prompt_lens,
                                                     subquery_lens=prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
                             sampling_metadata=sampling_metadata)
    for sequence_output in sampler_output:
        for position, sample in enumerate(sequence_output.samples):
            assert sample.output_token == position

    del model_runner
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_sampler_top_k_top_p
(
seed
:
int
,
device
:
str
):
def
test_sampler_top_k_top_p
(
seed
:
int
,
device
:
str
):
...
@@ -349,7 +518,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -349,7 +518,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
size
=
(
batch_size
,
vocab_size
),
size
=
(
batch_size
,
vocab_size
),
device
=
input_tensor
.
device
,
device
=
input_tensor
.
device
,
dtype
=
input_tensor
.
dtype
)
dtype
=
input_tensor
.
dtype
)
sampler
=
MockLogitsSampler
(
32000
,
fake_logits
)
sampler
=
MockLogitsSampler
(
fake_logits
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
generation_model
=
GenerationMixin
()
generation_model
=
GenerationMixin
()
...
@@ -382,15 +551,13 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -382,15 +551,13 @@ def test_sampler_top_k_top_p(seed: int, device: str):
sample_probs
=
None
sample_probs
=
None
def
mock_sample
(
probs
,
logprobs
,
sampling_metadata
):
def
mock_sample
(
probs
,
*
args
,
**
kwargs
):
nonlocal
sample_probs
nonlocal
sample_probs
sample_probs
=
probs
sample_probs
=
probs
return
[[
prob
.
topk
(
1
,
dim
=-
1
).
indices
.
tolist
(),
[
0
]]
for
prob
in
probs
]
return
[[
prob
.
topk
(
1
,
dim
=-
1
).
indices
.
tolist
(),
[
0
]]
for
prob
in
probs
]
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
):
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
):
sampler
(
embedding
=
None
,
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
hidden_states
=
input_tensor
,
sampling_metadata
=
sampling_metadata
)
hf_probs
=
warpers
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
warpers
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
assert
torch
.
allclose
(
hf_probs
,
sample_probs
,
atol
=
1e-5
)
assert
torch
.
allclose
(
hf_probs
,
sample_probs
,
atol
=
1e-5
)
...
...
tests/samplers/test_seeded_generate.py
View file @
7c4f76e3
"""Verify that seeded random sampling is deterministic.
"""Verify that seeded random sampling is deterministic.
Run `pytest tests/samplers/test_seeded_generate.py
--forked
`.
Run `pytest tests/samplers/test_seeded_generate.py`.
"""
"""
import
copy
import
copy
import
random
import
random
...
@@ -8,8 +8,8 @@ from itertools import combinations
...
@@ -8,8 +8,8 @@ from itertools import combinations
import
pytest
import
pytest
from
vllm.model_executor.utils
import
set_random_seed
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.model_executor.utils
import
set_random_seed
MODEL
=
"facebook/opt-125m"
MODEL
=
"facebook/opt-125m"
RANDOM_SEEDS
=
list
(
range
(
5
))
RANDOM_SEEDS
=
list
(
range
(
5
))
...
...
tests/samplers/test_stop_reason.py
0 → 100644
View file @
7c4f76e3
"""Test the different finish_reason="stop" situations during generation:
1. One of the provided stop strings
2. One of the provided stop tokens
3. The EOS token
Run `pytest tests/samplers/test_stop_reason.py`.
"""
import
pytest
import
transformers
from
vllm
import
SamplingParams
MODEL
=
"facebook/opt-350m"
STOP_STR
=
"."
SEED
=
42
MAX_TOKENS
=
1024
@pytest.fixture
def vllm_model(vllm_runner):
    """Yield a runner built for ``MODEL`` and drop the local reference after.

    The local name is distinct from the fixture name so the fixture
    function is not shadowed inside its own body.
    """
    runner = vllm_runner(MODEL)
    yield runner
    del runner
def test_stop_reason(vllm_model, example_prompts):
    """Check ``finish_reason`` / ``stop_reason`` for each way of stopping.

    Three generations are run over ``example_prompts``:

    1. With ``stop_token_ids`` set to the token id of ``STOP_STR``: every
       completion must finish with reason ``"stop"`` and report that token
       id as ``stop_reason``.
    2. With ``stop`` set to ``STOP_STR``: every completion must finish with
       reason ``"stop"`` and report the string as ``stop_reason``.
    3. With no stop criteria: a completion either hits the length limit
       (``"length"``) or stops with ``stop_reason`` being ``None``.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
    llm = vllm_model.model

    # test stop token
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop_token_ids=[stop_token_id]))
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == stop_token_id

    # test stop string
    # Use the shared constant rather than a literal so the request and the
    # assertion below cannot drift apart if STOP_STR is changed.
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED,
                               max_tokens=MAX_TOKENS,
                               stop=STOP_STR))
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == STOP_STR

    # test EOS token
    outputs = llm.generate(example_prompts,
                           sampling_params=SamplingParams(
                               seed=SEED, max_tokens=MAX_TOKENS))
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "length" or (
            completion.finish_reason == "stop"
            and completion.stop_reason is None)
tests/spec_decode/__init__.py
0 → 100644
View file @
7c4f76e3
tests/spec_decode/test_batch_expansion.py
0 → 100644
View file @
7c4f76e3
import
pytest
import
torch
from
vllm.spec_decode.batch_expansion
import
BatchExpansionTop1Scorer
from
.utils
import
create_seq_group_metadata_from_prompts
,
mock_worker
@pytest.mark.parametrize('num_target_seq_ids', [100])
def test_create_target_seq_id_iterator(num_target_seq_ids: int):
    """Verify that freshly created sequence ids exceed every input seq id.

    For each list of input ids, the first ``num_target_seq_ids`` values
    drawn from the scorer's iterator are compared with the list's maximum.
    """
    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)

    seq_id_cases = [
        [1, 3, 5, 7],
        [*range(100), 0],
        [100],
    ]

    for input_ids in seq_id_cases:
        largest_input_id = max(input_ids)
        new_ids = scorer._create_target_seq_id_iterator(input_ids)  # pylint: disable=protected-access
        drawn = 0
        while drawn < num_target_seq_ids:
            assert next(new_ids) > largest_input_id
            drawn += 1
@pytest.mark.parametrize('k', [1, 2, 6])
def test_get_token_ids_to_score(k: int):
    """Verify that the right token id prefixes are chosen for scoring.

    The expected result is the empty list followed by every non-empty
    prefix of the ``k`` proposal tokens, shortest first.
    """
    proposals = torch.tensor(
        list(range(k)),
        dtype=torch.int64,
        device='cuda',
    )

    num_proposals = proposals.shape[0]
    prefixes = [
        proposals[:end].tolist() for end in range(1, num_proposals + 1)
    ]
    expected = [[]] + prefixes

    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
    raw_output = scorer._get_token_ids_to_score(proposals)  # pylint: disable=protected-access

    # Entries may be tensors or plain values; normalise tensors to lists
    # before comparing.
    actual = []
    for entry in raw_output:
        if isinstance(entry, torch.Tensor):
            actual.append(entry.tolist())
        else:
            actual.append(entry)

    assert actual == expected
@pytest.mark.parametrize('k', [1, 2, 6])
def test_create_single_target_seq_group_metadata(k: int):
    """Verify the metadata built for one batch-expanded target sequence.

    The output must keep the request id, hold exactly one sequence under
    the target seq id with the original prompt and the previous outputs
    extended by the ``k`` new token ids, and carry the input sequence's
    block table under the target seq id.
    """
    prompt_ids = [1, 2, 3]
    earlier_output_ids = [4, 5, 6]
    new_token_ids = list(range(k))

    history_len = len(prompt_ids) + len(earlier_output_ids)
    num_tokens_processed = history_len - 1
    final_seq_len = history_len + len(new_token_ids)

    block_size = 32
    num_gpu_blocks = 2048 // block_size
    metadata_list = create_seq_group_metadata_from_prompts(
        [prompt_ids], num_gpu_blocks, block_size, [final_seq_len],
        [earlier_output_ids], [num_tokens_processed])
    source_metadata = metadata_list[0]

    source_seq_id = list(source_metadata.seq_data.keys())[0]
    target_seq_id = 100

    scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000)
    result = scorer._create_single_target_seq_group_metadata(  # pylint: disable=protected-access
        source_metadata,
        source_seq_id,
        target_seq_id,
        new_token_ids,
    )

    assert result.request_id == source_metadata.request_id

    assert len(result.seq_data) == 1
    assert result.seq_data[target_seq_id].get_prompt_token_ids(
    ) == prompt_ids
    assert result.seq_data[target_seq_id].get_output_token_ids(
    ) == earlier_output_ids + new_token_ids

    assert len(result.block_tables) == 1
    source_block_table = source_metadata.block_tables[source_seq_id]
    assert result.block_tables[target_seq_id] == source_block_table
tests/spec_decode/test_metrics.py
0 → 100644
View file @
7c4f76e3
import
math
from
unittest.mock
import
MagicMock
import
pytest
import
torch
from
vllm.spec_decode.metrics
import
AsyncMetricsCollector
def test_initial_call_returns_none():
    """Check that the very first metrics collection attempt yields None.

    The collector wraps a mocked rejection sampler whose counters are all
    zero and is initialised for rank 0.
    """
    accepted = torch.tensor(0, dtype=torch.long, device='cuda')
    emitted = torch.tensor(0, dtype=torch.long, device='cuda')
    rej_sampler = MagicMock(
        num_accepted_tokens=accepted,
        num_emitted_tokens=emitted,
        num_draft_tokens=0,
    )

    collector = AsyncMetricsCollector(rej_sampler)
    collector.init_gpu_tensors(rank=0)

    assert collector.maybe_collect_rejsample_metrics(k=5) is None
def test_second_call_returns_metrics():
    """Check that the second collection attempt returns a metrics object.

    A mocked timer reports that more than ``collect_interval_s`` seconds
    have passed after its first reading.
    """
    collect_interval_s = 5.0

    accepted = torch.tensor(0, dtype=torch.long, device='cuda')
    emitted = torch.tensor(0, dtype=torch.long, device='cuda')
    rej_sampler = MagicMock(
        num_accepted_tokens=accepted,
        num_emitted_tokens=emitted,
        num_draft_tokens=0,
    )

    # Successive timer readings: start, then two past the interval.
    clock_readings = [
        0.0,
        collect_interval_s + 0.1,
        collect_interval_s + 0.2,
    ]
    timer = MagicMock(side_effect=clock_readings)

    collector = AsyncMetricsCollector(
        rejection_sampler=rej_sampler,
        timer=timer,
        collect_interval_s=collect_interval_s,
    )
    collector.init_gpu_tensors(rank=0)

    # The result of the first attempt is deliberately discarded.
    collector.maybe_collect_rejsample_metrics(k=5)
    second_result = collector.maybe_collect_rejsample_metrics(k=5)

    assert second_result is not None
@pytest.mark.parametrize("rank", [1, 2, 3, 4])
def test_nonzero_rank_noop(rank):
    """Check that a collector initialised with a nonzero rank returns None.

    Two collection attempts are made; the second must still produce no
    metrics.
    """
    accepted = torch.tensor(0, dtype=torch.long, device='cuda')
    emitted = torch.tensor(0, dtype=torch.long, device='cuda')
    rej_sampler = MagicMock(
        num_accepted_tokens=accepted,
        num_emitted_tokens=emitted,
        num_draft_tokens=0,
    )

    collector = AsyncMetricsCollector(rej_sampler)
    collector.init_gpu_tensors(rank=rank)

    # First attempt: result intentionally ignored.
    collector.maybe_collect_rejsample_metrics(k=5)
    second_result = collector.maybe_collect_rejsample_metrics(k=5)

    assert second_result is None
def test_noop_until_time():
    """Check that no metrics are produced before the interval has elapsed.

    The mocked timer first reports readings just short of
    ``collect_interval_s`` (the second attempt must return None), then
    readings just past it (the fourth attempt must return metrics).
    """
    collect_interval_s = 5.0
    before_deadline = collect_interval_s - 0.1
    after_deadline = collect_interval_s + 0.1

    accepted = torch.tensor(0, dtype=torch.long, device='cuda')
    emitted = torch.tensor(0, dtype=torch.long, device='cuda')
    rej_sampler = MagicMock(
        num_accepted_tokens=accepted,
        num_emitted_tokens=emitted,
        num_draft_tokens=0,
    )

    timer = MagicMock(side_effect=[
        0.0,
        before_deadline,
        before_deadline,
        after_deadline,
        after_deadline,
    ])

    collector = AsyncMetricsCollector(
        rejection_sampler=rej_sampler,
        timer=timer,
        collect_interval_s=collect_interval_s,
    )
    collector.init_gpu_tensors(rank=0)

    # Phase 1: timer readings are still inside the collection interval.
    collector.maybe_collect_rejsample_metrics(k=5)
    early_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert early_result is None

    # Phase 2: timer readings are past the collection interval.
    collector.maybe_collect_rejsample_metrics(k=5)
    late_result = collector.maybe_collect_rejsample_metrics(k=5)
    assert late_result is not None
@pytest.mark.parametrize("has_data", [True, False])
def test_initial_metrics_has_correct_values(has_data: bool):
    """Check the values reported in the collected metrics.

    With ``has_data`` the sampler counters are nonzero and the reported
    rates must equal the corresponding ratios; without data all counters
    are zero and both rates must be NaN.
    """
    if has_data:
        num_accepted_tokens, num_emitted_tokens, num_draft_tokens = (103, 104,
                                                                     105)
    else:
        num_accepted_tokens = num_emitted_tokens = num_draft_tokens = 0

    k = 5
    num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens(
        num_draft_tokens, k)

    accepted = torch.tensor(num_accepted_tokens,
                            dtype=torch.long,
                            device='cuda')
    emitted = torch.tensor(num_emitted_tokens,
                           dtype=torch.long,
                           device='cuda')
    rej_sampler = MagicMock(
        num_accepted_tokens=accepted,
        num_emitted_tokens=emitted,
        num_draft_tokens=num_draft_tokens,
    )

    collect_interval_s = 5.0
    # Successive timer readings: start, then two past the interval.
    timer = MagicMock(side_effect=[
        0.0,
        collect_interval_s + 0.1,
        collect_interval_s + 0.2,
    ])

    collector = AsyncMetricsCollector(
        rejection_sampler=rej_sampler,
        timer=timer,
        collect_interval_s=collect_interval_s,
    )
    collector.init_gpu_tensors(rank=0)

    # The result of the first attempt is deliberately discarded.
    collector.maybe_collect_rejsample_metrics(k)
    metrics = collector.maybe_collect_rejsample_metrics(k)

    assert metrics.num_spec_tokens == k
    assert metrics.accepted_tokens == num_accepted_tokens
    assert metrics.draft_tokens == num_draft_tokens
    assert metrics.emitted_tokens == num_emitted_tokens

    if not has_data:
        # Zero counters: both rates are expected to be NaN.
        assert math.isnan(metrics.draft_acceptance_rate)
        assert math.isnan(metrics.system_efficiency)
        return

    expected_acceptance_rate = num_accepted_tokens / num_draft_tokens
    assert metrics.draft_acceptance_rate == expected_acceptance_rate
    expected_efficiency = num_emitted_tokens / num_possible_tokens
    assert metrics.system_efficiency == expected_efficiency
Prev
1
…
4
5
6
7
8
9
10
11
12
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment