Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
947 additions
and
38 deletions
+947
-38
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+1
-1
tests/models/registry.py
tests/models/registry.py
+3
-0
tests/models/test_registry.py
tests/models/test_registry.py
+7
-4
tests/multimodal/test_mapper.py
tests/multimodal/test_mapper.py
+1
-1
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+3
-2
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+112
-10
tests/runai_model_streamer/__init__.py
tests/runai_model_streamer/__init__.py
+0
-0
tests/runai_model_streamer/test_runai_model_streamer_loader.py
.../runai_model_streamer/test_runai_model_streamer_loader.py
+31
-0
tests/runai_model_streamer/test_weight_utils.py
tests/runai_model_streamer/test_weight_utils.py
+39
-0
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+63
-0
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+7
-2
tests/tool_use/utils.py
tests/tool_use/utils.py
+9
-1
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+86
-2
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+0
-15
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+38
-0
tests/v1/sample/__init__.py
tests/v1/sample/__init__.py
+0
-0
tests/v1/sample/test_sampler.py
tests/v1/sample/test_sampler.py
+321
-0
tests/v1/worker/__init__.py
tests/v1/worker/__init__.py
+0
-0
tests/v1/worker/test_gpu_input_batch.py
tests/v1/worker/test_gpu_input_batch.py
+224
-0
tests/weight_loading/models.txt
tests/weight_loading/models.txt
+2
-0
No files found.
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
96ae75ad
...
@@ -7,7 +7,7 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
...
@@ -7,7 +7,7 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
global_force_attn_backend_context_manager
)
global_force_attn_backend_context_manager
)
from
vllm.multimodal.
utils
import
rescale_image_size
from
vllm.multimodal.
image
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
...
...
tests/models/registry.py
View file @
96ae75ad
...
@@ -61,6 +61,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -61,6 +61,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"DeepseekForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-llm-7b-chat"
),
"DeepseekForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-llm-7b-chat"
),
"DeepseekV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V2-Lite-Chat"
,
# noqa: E501
"DeepseekV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V2-Lite-Chat"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"DeepseekV3ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/DeepSeek-V3"
,
# noqa: E501
trust_remote_code
=
True
),
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
# noqa: E501
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
# noqa: E501
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2b"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2b"
),
...
@@ -138,6 +140,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -138,6 +140,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
),
"BertModel"
:
_HfExamplesInfo
(
"BAAI/bge-base-en-v1.5"
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
),
"Gemma2Model"
:
_HfExamplesInfo
(
"BAAI/bge-multilingual-gemma2"
),
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"GritLM"
:
_HfExamplesInfo
(
"parasail-ai/GritLM-7B-vllm"
),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
"ai21labs/Jamba-tiny-reward-dev"
),
# noqa: E501
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"MistralModel"
:
_HfExamplesInfo
(
"intfloat/e5-mistral-7b-instruct"
),
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
"Qwen2Model"
:
_HfExamplesInfo
(
"ssmits/Qwen2-7B-Instruct-embed-base"
),
...
...
tests/models/test_registry.py
View file @
96ae75ad
...
@@ -6,7 +6,9 @@ import torch.cuda
...
@@ -6,7 +6,9 @@ import torch.cuda
from
vllm.model_executor.models
import
(
is_pooling_model
,
from
vllm.model_executor.models
import
(
is_pooling_model
,
is_text_generation_model
,
is_text_generation_model
,
supports_multimodal
)
supports_multimodal
)
from
vllm.model_executor.models.adapters
import
as_embedding_model
from
vllm.model_executor.models.adapters
import
(
as_classification_model
,
as_embedding_model
,
as_reward_model
)
from
vllm.model_executor.models.registry
import
(
_MULTIMODAL_MODELS
,
from
vllm.model_executor.models.registry
import
(
_MULTIMODAL_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_TEXT_GENERATION_MODELS
,
_TEXT_GENERATION_MODELS
,
...
@@ -29,9 +31,10 @@ def test_registry_imports(model_arch):
...
@@ -29,9 +31,10 @@ def test_registry_imports(model_arch):
or
model_arch
in
_MULTIMODAL_MODELS
):
or
model_arch
in
_MULTIMODAL_MODELS
):
assert
is_text_generation_model
(
model_cls
)
assert
is_text_generation_model
(
model_cls
)
# All vLLM models should be convertible to an embedding model
# All vLLM models should be convertible to a pooling model
embed_model
=
as_embedding_model
(
model_cls
)
assert
is_pooling_model
(
as_classification_model
(
model_cls
))
assert
is_pooling_model
(
embed_model
)
assert
is_pooling_model
(
as_embedding_model
(
model_cls
))
assert
is_pooling_model
(
as_reward_model
(
model_cls
))
if
model_arch
in
_MULTIMODAL_MODELS
:
if
model_arch
in
_MULTIMODAL_MODELS
:
assert
supports_multimodal
(
model_cls
)
assert
supports_multimodal
(
model_cls
)
...
...
tests/multimodal/test_mapper.py
View file @
96ae75ad
...
@@ -8,7 +8,7 @@ from transformers import LlavaNextImageProcessor
...
@@ -8,7 +8,7 @@ from transformers import LlavaNextImageProcessor
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal.
utils
import
rescale_image_size
from
vllm.multimodal.
image
import
rescale_image_size
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
96ae75ad
...
@@ -13,6 +13,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput
...
@@ -13,6 +13,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput
class
MyGemma2Embedding
(
nn
.
Module
):
class
MyGemma2Embedding
(
nn
.
Module
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
...
@@ -62,8 +63,8 @@ class MyGemma2Embedding(nn.Module):
...
@@ -62,8 +63,8 @@ class MyGemma2Embedding(nn.Module):
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
weights
=
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
self
.
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
((
name
,
data
)
for
name
,
data
in
weights
weights
=
((
name
,
data
)
for
name
,
data
in
weights
if
not
name
.
startswith
(
"lm_head."
))
if
not
name
.
startswith
(
"lm_head."
))
return
self
.
model
.
load_weights
(
weights
)
return
self
.
model
.
load_weights
(
weights
)
tests/quantization/test_compressed_tensors.py
View file @
96ae75ad
...
@@ -12,11 +12,14 @@ from compressed_tensors.quantization import QuantizationType
...
@@ -12,11 +12,14 @@ from compressed_tensors.quantization import QuantizationType
from
tests.models.utils
import
check_logprobs_close
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16Sparse24
,
CompressedTensors24
,
CompressedTensorsLinearMethod
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
CompressedTensorsW8A8Int8
,
CompressedTensorsW8A16Fp8
,
from
..utils
import
models_path_prefix
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
sparse_cutlass_supported
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -79,12 +82,12 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
...
@@ -79,12 +82,12 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
assert
output
assert
output
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
"
model_path
"
,
os
.
path
.
join
(
model
s
_path
_prefix
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
)
,
[
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"
),
os
.
path
.
join
(
models_path_prefix
,
"n
euralmagic/Llama-3.2-1B-quantized.w8a8
"
)
os
.
path
.
join
(
models_path_prefix
,
"n
m-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym
"
)
,
# TODO static & asymmetric
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
)
])
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_compressed_tensors_w8a8_logprobs
(
hf_runner
,
vllm_runner
,
def
test_compressed_tensors_w8a8_logprobs
(
hf_runner
,
vllm_runner
,
...
@@ -92,6 +95,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
...
@@ -92,6 +95,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
max_tokens
,
num_logprobs
):
max_tokens
,
num_logprobs
):
dtype
=
"bfloat16"
dtype
=
"bfloat16"
# skip language translation prompt for the static per tensor asym model
if
model_path
==
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
):
# noqa: E501
example_prompts
=
example_prompts
[
0
:
-
1
]
with
hf_runner
(
model_path
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model_path
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
...
@@ -220,3 +227,98 @@ def test_compressed_tensors_kv_cache(vllm_runner):
...
@@ -220,3 +227,98 @@ def test_compressed_tensors_kv_cache(vllm_runner):
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
output
=
llm
.
generate_greedy
(
"Hello world!"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello world!"
,
max_tokens
=
20
)
assert
output
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
def
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
):
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensors24
)
assert
qkv_proj
.
scheme
.
weight_quant
.
strategy
==
weight_strategy
assert
qkv_proj
.
scheme
.
input_quant
.
strategy
==
input_strategy
assert
qkv_proj
.
scheme
.
quantized
assert
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
sparsity_map
=
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
# noqa: E501
assert
sparsity_map
.
get
(
"Linear"
).
format
==
"dense"
assert
sparsity_map
.
get
(
"Linear"
).
sparsity_structure
==
"2:4"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing"
,
"channel"
,
"token"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing"
,
"channel"
,
"tensor"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing"
,
"tensor"
,
"tensor"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
),
])
def
test_compressed_tensors_2of4_quant_fp8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
qkv_proj
.
scheme
.
weights_dtype
==
torch
.
float8_e4m3fn
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing"
,
"channel"
,
"token"
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing"
,
"tensor"
,
"tensor"
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
),
])
def
test_compressed_tensors_2of4_quant_int8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
qkv_proj
.
scheme
.
weights_dtype
==
torch
.
int8
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"
)])
def
test_compressed_tensors_2of4_sparse
(
vllm_runner
,
args_2of4
):
model
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensors24
)
assert
qkv_proj
.
scheme
.
weight_quant
is
None
assert
qkv_proj
.
scheme
.
input_quant
is
None
assert
not
qkv_proj
.
scheme
.
quantized
assert
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
sparsity_map
=
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
# noqa: E501
assert
sparsity_map
.
get
(
"Linear"
).
format
==
"dense"
assert
sparsity_map
.
get
(
"Linear"
).
sparsity_structure
==
"2:4"
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
tests/runai_model_streamer/__init__.py
0 → 100644
View file @
96ae75ad
tests/runai_model_streamer/test_runai_model_streamer_loader.py
0 → 100644
View file @
96ae75ad
from
vllm
import
SamplingParams
from
vllm.config
import
LoadConfig
,
LoadFormat
from
vllm.model_executor.model_loader.loader
import
(
RunaiModelStreamerLoader
,
get_model_loader
)
test_model
=
"openai-community/gpt2"
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
seed
=
0
)
def
get_runai_model_loader
():
load_config
=
LoadConfig
(
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
return
get_model_loader
(
load_config
)
def
test_get_model_loader_with_runai_flag
():
model_loader
=
get_runai_model_loader
()
assert
isinstance
(
model_loader
,
RunaiModelStreamerLoader
)
def
test_runai_model_loader_download_files
(
vllm_runner
):
with
vllm_runner
(
test_model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
as
llm
:
deserialized_outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
assert
deserialized_outputs
tests/runai_model_streamer/test_weight_utils.py
0 → 100644
View file @
96ae75ad
import
glob
import
tempfile
import
huggingface_hub.constants
import
torch
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
runai_safetensors_weights_iterator
,
safetensors_weights_iterator
)
def
test_runai_model_loader
():
with
tempfile
.
TemporaryDirectory
()
as
tmpdir
:
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
False
download_weights_from_hf
(
"openai-community/gpt2"
,
allow_patterns
=
[
"*.safetensors"
],
cache_dir
=
tmpdir
)
safetensors
=
glob
.
glob
(
f
"
{
tmpdir
}
/**/*.safetensors"
,
recursive
=
True
)
assert
len
(
safetensors
)
>
0
runai_model_streamer_tensors
=
{}
hf_safetensors_tensors
=
{}
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
):
runai_model_streamer_tensors
[
name
]
=
tensor
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
):
hf_safetensors_tensors
[
name
]
=
tensor
assert
len
(
runai_model_streamer_tensors
)
==
len
(
hf_safetensors_tensors
)
for
name
,
runai_tensor
in
runai_model_streamer_tensors
.
items
():
assert
runai_tensor
.
dtype
==
hf_safetensors_tensors
[
name
].
dtype
assert
runai_tensor
.
shape
==
hf_safetensors_tensors
[
name
].
shape
assert
torch
.
all
(
runai_tensor
.
eq
(
hf_safetensors_tensors
[
name
]))
if
__name__
==
"__main__"
:
test_runai_model_loader
()
tests/samplers/test_rejection_sampler.py
View file @
96ae75ad
...
@@ -203,6 +203,69 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
...
@@ -203,6 +203,69 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
@
pytest
.
mark
.
skipif
(
is_hip
(),
@
pytest
.
mark
.
skipif
(
is_hip
(),
reason
=
"Consistent with NV."
)
reason
=
"Consistent with NV."
)
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
3
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_mixed_seeded_batch
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
torch
.
set_default_device
(
device
)
set_random_seed
(
0
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
single_batches
=
[]
for
i
in
range
(
batch_size
):
single_batches
.
append
((
draft_probs
[
i
].
clone
().
unsqueeze
(
0
),
draft_token_ids
[
i
].
clone
().
unsqueeze
(
0
),
target_probs
[
i
].
clone
().
unsqueeze
(
0
),
bonus_token_ids
[
i
].
clone
().
unsqueeze
(
0
),
draft_token_ids
[
i
].
clone
().
unsqueeze
(
0
)))
set_random_seed
(
0
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
results
=
[]
seeded_seqs
=
{
i
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
for
i
in
range
(
1
,
batch_size
)
# 0 is seed None
}
batch_result
=
rejection_sampler
(
target_probs
.
clone
(),
bonus_token_ids
.
clone
(),
draft_probs
.
clone
(),
draft_token_ids
.
clone
(),
seeded_seqs
)
set_random_seed
(
0
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
for
i
in
range
(
batch_size
):
request_seeded_seqs
=
{
0
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
}
if
seeded_seqs
.
get
(
i
)
is
not
None
else
None
(
draft_probs
,
draft_token_ids
,
target_probs
,
bonus_token_ids
,
draft_token_ids
)
=
single_batches
[
i
]
results
.
append
(
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
request_seeded_seqs
))
for
i
in
range
(
batch_size
):
assert
torch
.
equal
(
batch_result
[
i
],
results
[
i
].
squeeze
(
0
))
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
,
128
])
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
96ae75ad
...
@@ -9,7 +9,6 @@ import openai
...
@@ -9,7 +9,6 @@ import openai
import
pytest
import
pytest
import
torch
import
torch
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
tensorizer
import
EncryptionParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
...
@@ -23,13 +22,19 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
...
@@ -23,13 +22,19 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
serialize_vllm_model
,
serialize_vllm_model
,
tensorize_vllm_model
)
tensorize_vllm_model
)
# yapf: enable
# yapf: enable
from
vllm.utils
import
import_from_path
from
vllm.utils
import
PlaceholderModule
,
import_from_path
from
..conftest
import
VllmRunner
from
..conftest
import
VllmRunner
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
.conftest
import
retry_until_skip
from
.conftest
import
retry_until_skip
from
..utils
import
RemoteOpenAIServer
,
models_path_prefix
from
..utils
import
RemoteOpenAIServer
,
models_path_prefix
try
:
from
tensorizer
import
EncryptionParams
except
ImportError
:
tensorizer
=
PlaceholderModule
(
"tensorizer"
)
# type: ignore[assignment]
EncryptionParams
=
tensorizer
.
placeholder_attr
(
"EncryptionParams"
)
EXAMPLES_PATH
=
VLLM_PATH
/
"examples"
EXAMPLES_PATH
=
VLLM_PATH
/
"examples"
prompts
=
[
prompts
=
[
...
...
tests/tool_use/utils.py
View file @
96ae75ad
...
@@ -105,7 +105,7 @@ CONFIGS: Dict[str, ServerConfig] = {
...
@@ -105,7 +105,7 @@ CONFIGS: Dict[str, ServerConfig] = {
"supports_rocm"
:
"supports_rocm"
:
False
,
False
,
},
},
"granite8b"
:
{
"granite
-3.0-
8b"
:
{
"model"
:
"model"
:
"ibm-granite/granite-3.0-8b-instruct"
,
"ibm-granite/granite-3.0-8b-instruct"
,
"arguments"
:
[
"arguments"
:
[
...
@@ -113,6 +113,14 @@ CONFIGS: Dict[str, ServerConfig] = {
...
@@ -113,6 +113,14 @@ CONFIGS: Dict[str, ServerConfig] = {
str
(
VLLM_PATH
/
"examples/tool_chat_template_granite.jinja"
)
str
(
VLLM_PATH
/
"examples/tool_chat_template_granite.jinja"
)
],
],
},
},
"granite-3.1-8b"
:
{
"model"
:
"ibm-granite/granite-3.1-8b-instruct"
,
"arguments"
:
[
"--tool-call-parser"
,
"granite"
,
],
"supports_parallel"
:
True
,
},
"internlm"
:
{
"internlm"
:
{
"model"
:
"model"
:
"internlm/internlm2_5-7b-chat"
,
"internlm/internlm2_5-7b-chat"
,
...
...
tests/v1/core/test_prefix_caching.py
View file @
96ae75ad
...
@@ -2,16 +2,23 @@
...
@@ -2,16 +2,23 @@
import
pytest
import
pytest
from
vllm.inputs
import
token_inputs
from
vllm.inputs
import
token_inputs
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
cdiv
from
vllm.utils
import
cdiv
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
,
Request
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
,
Request
from
vllm.v1.core.kv_cache_utils
import
KVCacheBlock
,
hash_block_tokens
from
vllm.v1.core.kv_cache_utils
import
KVCacheBlock
,
hash_block_tokens
def
make_request
(
request_id
,
prompt_token_ids
):
def
make_request
(
request_id
,
prompt_token_ids
,
mm_positions
=
None
,
mm_hashes
=
None
):
return
Request
(
return
Request
(
request_id
=
request_id
,
request_id
=
request_id
,
inputs
=
token_inputs
(
prompt_token_ids
=
prompt_token_ids
),
inputs
=
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
multi_modal_placeholders
=
{
"image"
:
mm_positions
}
if
mm_positions
else
None
,
multi_modal_hashes
=
mm_hashes
),
sampling_params
=
SamplingParams
(
max_tokens
=
17
),
sampling_params
=
SamplingParams
(
max_tokens
=
17
),
eos_token_id
=
100
,
eos_token_id
=
100
,
arrival_time
=
0
,
arrival_time
=
0
,
...
@@ -38,6 +45,7 @@ def test_prefill():
...
@@ -38,6 +45,7 @@ def test_prefill():
all_token_ids
=
common_token_ids
+
unique_token_ids
all_token_ids
=
common_token_ids
+
unique_token_ids
req0
=
make_request
(
"0"
,
all_token_ids
)
req0
=
make_request
(
"0"
,
all_token_ids
)
computed_blocks
=
manager
.
get_computed_blocks
(
req0
)
computed_blocks
=
manager
.
get_computed_blocks
(
req0
)
assert
len
(
req0
.
kv_block_hashes
)
==
3
assert
not
computed_blocks
assert
not
computed_blocks
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
...
@@ -61,6 +69,7 @@ def test_prefill():
...
@@ -61,6 +69,7 @@ def test_prefill():
unique_token_ids
=
[
3
]
*
5
unique_token_ids
=
[
3
]
*
5
req1
=
make_request
(
"1"
,
common_token_ids
+
unique_token_ids
)
req1
=
make_request
(
"1"
,
common_token_ids
+
unique_token_ids
)
computed_blocks
=
manager
.
get_computed_blocks
(
req1
)
computed_blocks
=
manager
.
get_computed_blocks
(
req1
)
assert
len
(
req1
.
kv_block_hashes
)
==
3
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
,
2
]
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
,
2
]
num_new_tokens
=
53
-
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
...
@@ -90,6 +99,7 @@ def test_prefill():
...
@@ -90,6 +99,7 @@ def test_prefill():
unique_token_ids
=
[
3
]
*
6
unique_token_ids
=
[
3
]
*
6
req2
=
make_request
(
"2"
,
common_token_ids
+
unique_token_ids
)
req2
=
make_request
(
"2"
,
common_token_ids
+
unique_token_ids
)
computed_block
=
manager
.
get_computed_blocks
(
req2
)
computed_block
=
manager
.
get_computed_blocks
(
req2
)
assert
len
(
req2
.
kv_block_hashes
)
==
3
assert
[
b
.
block_id
for
b
in
computed_block
]
==
[
0
,
1
,
2
]
assert
[
b
.
block_id
for
b
in
computed_block
]
==
[
0
,
1
,
2
]
num_new_tokens
=
53
-
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req2
,
num_new_tokens
,
computed_blocks
)
blocks
=
manager
.
allocate_slots
(
req2
,
num_new_tokens
,
computed_blocks
)
...
@@ -416,3 +426,77 @@ def test_cache_blocks():
...
@@ -416,3 +426,77 @@ def test_cache_blocks():
)
)
assert
len
(
manager
.
cached_block_hash_to_block
)
==
3
assert
len
(
manager
.
cached_block_hash_to_block
)
==
3
assert
blocks
[
0
].
block_hash
is
not
None
assert
blocks
[
0
].
block_hash
is
not
None
def
test_mm_prefix_caching
():
"""
This tests that the multi-modal prefix caching is correct.
"""
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
# Common prompt tokens (T is text tokens and P is image placeholder tokens)
# [T,...,T, P0,...,P0], [P0,...,P0,T,...,T,P1,...,P1], [P1,...,P1]
common_token_ids
=
list
(
range
(
10
))
+
[
-
1
]
*
6
common_token_ids
+=
[
-
1
]
*
4
+
list
(
range
(
10
,
20
))
+
[
-
1
]
*
2
common_token_ids
+=
[
-
1
]
*
16
common_mm_positions
=
[
PlaceholderRange
(
offset
=
11
,
length
=
10
),
PlaceholderRange
(
offset
=
30
,
length
=
18
),
]
common_mm_hashes
=
[
"aaa"
,
"bbb"
]
# A unique image plus some text tokens.
unique_token_ids
=
[
-
1
]
*
7
+
[
100
]
*
4
all_token_ids
=
common_token_ids
+
unique_token_ids
mm_positions
=
common_mm_positions
+
[
PlaceholderRange
(
offset
=
48
,
length
=
7
)
]
mm_hashes
=
common_mm_hashes
+
[
"ccc"
]
req0
=
make_request
(
"0"
,
all_token_ids
,
mm_positions
=
mm_positions
,
mm_hashes
=
mm_hashes
)
computed_blocks
=
manager
.
get_computed_blocks
(
req0
)
# Completed block should have hashes with extra keys.
assert
not
computed_blocks
assert
len
(
req0
.
kv_block_hashes
)
==
3
assert
req0
.
kv_block_hashes
[
0
].
extra_keys
==
((
"aaa"
,
0
),
)
assert
req0
.
kv_block_hashes
[
1
].
extra_keys
==
((
"aaa"
,
5
),
(
"bbb"
,
0
))
assert
req0
.
kv_block_hashes
[
2
].
extra_keys
==
((
"bbb"
,
2
),
)
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
req0
.
num_computed_tokens
=
59
# Append slots without allocating a new block.
for
_
in
range
(
5
):
req0
.
append_output_token_ids
(
8
)
new_blocks
=
manager
.
append_slots
(
req0
,
5
)
assert
new_blocks
is
not
None
and
len
(
new_blocks
)
==
0
# The just completed block should have hashes with extra keys.
assert
len
(
req0
.
kv_block_hashes
)
==
4
assert
req0
.
kv_block_hashes
[
3
].
extra_keys
==
((
"ccc"
,
0
),
)
# Cache hit.
unique_token_ids
=
[
-
1
]
*
7
+
[
200
]
*
5
all_token_ids
=
common_token_ids
+
unique_token_ids
mm_positions
=
common_mm_positions
+
[
PlaceholderRange
(
offset
=
48
,
length
=
7
)
]
mm_hashes
=
common_mm_hashes
+
[
"ccc"
]
req1
=
make_request
(
"1"
,
all_token_ids
,
mm_positions
=
mm_positions
,
mm_hashes
=
mm_hashes
)
computed_blocks
=
manager
.
get_computed_blocks
(
req1
)
assert
len
(
computed_blocks
)
==
3
tests/v1/engine/test_engine_args.py
View file @
96ae75ad
...
@@ -33,14 +33,6 @@ def test_prefix_caching_from_cli():
...
@@ -33,14 +33,6 @@ def test_prefix_caching_from_cli():
assert
engine_args
.
enable_prefix_caching
assert
engine_args
.
enable_prefix_caching
def test_defaults():
    """Check that a default-constructed ``EngineArgs`` enables prefix caching."""
    model_path = os.path.join(models_path_prefix, "facebook/opt-125m")
    args = EngineArgs(model=model_path)

    # Assert V1 defaults
    prefix_caching_enabled = args.enable_prefix_caching
    assert prefix_caching_enabled, "V1 turns on prefix caching by default"
def
test_defaults_with_usage_context
():
def
test_defaults_with_usage_context
():
engine_args
=
EngineArgs
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
))
engine_args
=
EngineArgs
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
))
vllm_config
:
VllmConfig
=
engine_args
.
create_engine_config
(
vllm_config
:
VllmConfig
=
engine_args
.
create_engine_config
(
...
@@ -54,10 +46,3 @@ def test_defaults_with_usage_context():
...
@@ -54,10 +46,3 @@ def test_defaults_with_usage_context():
UsageContext
.
OPENAI_API_SERVER
)
UsageContext
.
OPENAI_API_SERVER
)
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
2048
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
2048
def test_prefix_cache_disabled_with_multimodel():
    """Check that the engine config built for a LLaVA model has prefix
    caching turned off in its cache config."""
    model_path = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
    args = EngineArgs(model=model_path)

    config = args.create_engine_config(UsageContext.LLM_CLASS)
    cache_config = config.cache_config
    assert not cache_config.enable_prefix_caching
tests/v1/engine/test_engine_core.py
View file @
96ae75ad
...
@@ -141,3 +141,41 @@ def test_engine_core(monkeypatch):
...
@@ -141,3 +141,41 @@ def test_engine_core(monkeypatch):
engine_core
.
abort_requests
([
req2
.
request_id
,
req0
.
request_id
])
engine_core
.
abort_requests
([
req2
.
request_id
,
req0
.
request_id
])
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
waiting
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
assert
len
(
engine_core
.
scheduler
.
running
)
==
0
def test_engine_core_advanced_sampling(monkeypatch):
    """
    End-to-end smoke test: the engine core must drain a request that sets
    extra sampling parameters (min_tokens, the three penalties and
    stop_token_ids) without leaving anything in its queues.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "1")

        # Setup the EngineCore.
        context = UsageContext.UNKNOWN_CONTEXT
        vllm_config = EngineArgs(model=MODEL_NAME).create_engine_config(
            usage_context=context)
        engine_core = EngineCore(
            vllm_config=vllm_config,
            executor_class=AsyncLLM._get_executor_cls(vllm_config),
            usage_context=context)
        scheduler = engine_core.scheduler

        # Test basic request lifecycle.
        # First request.
        request: EngineCoreRequest = make_request()
        request.sampling_params = SamplingParams(
            min_tokens=4,
            presence_penalty=1.0,
            frequency_penalty=1.0,
            repetition_penalty=0.1,
            stop_token_ids=[1001, 1002],
        )
        engine_core.add_request(request)

        # The request is queued but not yet scheduled.
        assert len(scheduler.waiting) == 1
        assert len(scheduler.running) == 0

        # Loop through until they are all done: keep stepping until a step
        # produces no outputs.
        while True:
            if len(engine_core.step()) == 0:
                break

        assert len(scheduler.waiting) == 0
        assert len(scheduler.running) == 0
tests/v1/sample/__init__.py
0 → 100644
View file @
96ae75ad
tests/v1/sample/test_sampler.py
0 → 100644
View file @
96ae75ad
from
typing
import
List
,
Set
,
Tuple
import
numpy
as
np
import
pytest
import
torch
from
vllm.utils
import
make_tensor_with_pad
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.sample.sampler
import
Sampler
# Vocabulary size used for the fake logits and random token ids below.
VOCAB_SIZE = 1024
# Number of already-generated tokens fabricated per request.
NUM_OUTPUT_TOKENS = 20
# Devices the tests are parametrized over: only "cuda:0" when exactly one
# CUDA device is reported, otherwise "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{device_index}"
    for device_index in range(1 if torch.cuda.device_count() == 1 else 2)
]
# Exclusive upper bound for the randomly drawn prompt length.
MAX_NUM_PROMPT_TOKENS = 64
def
_create_fake_logits
(
batch_size
:
int
,
vocab_size
:
int
)
->
torch
.
Tensor
:
fake_logits
=
torch
.
full
((
batch_size
,
vocab_size
),
1e-2
,
dtype
=
torch
.
float
)
return
fake_logits
def
_create_penalty_tensor
(
batch_size
:
int
,
penalty_value
:
float
,
device
:
torch
.
device
)
->
torch
.
Tensor
:
return
torch
.
full
((
batch_size
,
),
fill_value
=
penalty_value
,
dtype
=
torch
.
float
,
device
=
device
)
def _create_prompt_tokens_tensor(
    prompt_token_ids: List[List[int]],
    vocab_size: int,
    device: torch.device,
) -> torch.Tensor:
    """Pack the per-request prompt token lists into one int64 tensor on
    ``device`` via ``make_tensor_with_pad``, using ``vocab_size`` as the
    pad value and without pinned memory."""
    padding_options = dict(
        pad=vocab_size,
        device=device,
        dtype=torch.int64,
        pin_memory=False,
    )
    return make_tensor_with_pad(prompt_token_ids, **padding_options)
def _create_default_sampling_metadata(
    num_output_tokens: int,
    batch_size: int,
    vocab_size: int,
    device: torch.device,
) -> SamplingMetadata:
    """Build a ``SamplingMetadata`` with random tokens and neutral settings.

    Each request gets ``num_output_tokens`` random output tokens and a
    random prompt of 1 to ``MAX_NUM_PROMPT_TOKENS - 1`` tokens. Sampling is
    configured as all-greedy with top-p/top-k disabled, zero presence and
    frequency penalties, repetition penalty 1.0, ``no_penalties=True`` and
    empty ``min_tokens`` / ``stop_token_ids``; tests override the fields
    they exercise.
    """
    output_token_ids: List[List[int]] = []
    prompt_token_ids: List[List[int]] = []
    for _ in range(batch_size):
        generated = np.random.randint(0, vocab_size, size=num_output_tokens)
        output_token_ids.append(generated.tolist())

        # Prompt length is drawn first, then the prompt tokens themselves.
        prompt_length = np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
        prompt = np.random.randint(0, vocab_size, size=prompt_length)
        prompt_token_ids.append(prompt.tolist())

    return SamplingMetadata(
        temperature=torch.full((batch_size, ), 0.0),
        all_greedy=True,
        all_random=False,
        # top_p / top_k contents are left uninitialised; no_top_p and
        # no_top_k are set to True below.
        top_p=torch.empty(batch_size, ),
        top_k=torch.empty(batch_size, ),
        no_top_p=True,
        no_top_k=True,
        generators={},
        max_num_logprobs=0,
        prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids,
                                                      vocab_size, device),
        output_token_ids=output_token_ids,
        frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
        no_penalties=True,
        min_tokens=[],
        stop_token_ids=[],
    )
def
_generate_min_token_penalties_and_stop_tokens
(
num_output_tokens
:
int
,
batch_size
:
int
,
vocab_size
:
int
,
batch_indices_for_min_token_penalty
:
List
[
int
]
)
->
Tuple
[
List
[
int
],
List
[
Set
[
int
]]]:
"""
Generates and returns a list of minimum token penalties (`min_tokens`)
and a corresponding list of stop token IDs (`stop_token_ids`) for each
batch.
If a batch index is included in `batch_indices_for_min_token_penalty`,
a higher `min_tokens` value is assigned (within a randomized range),
and a random set of stop token IDs is created. Otherwise, a lower
`min_tokens` value is assigned, and the stop token IDs set is empty.
"""
stop_token_ids
:
List
[
Set
[
int
]]
=
[]
min_tokens
:
List
[
int
]
=
[]
for
index
in
range
(
batch_size
):
if
index
in
batch_indices_for_min_token_penalty
:
min_tokens
.
append
(
np
.
random
.
randint
(
num_output_tokens
+
1
,
2
*
num_output_tokens
))
stop_token_ids
.
append
(
set
(
np
.
random
.
randint
(
0
,
vocab_size
-
1
)
for
_
in
range
(
np
.
random
.
randint
(
0
,
vocab_size
))))
else
:
min_tokens
.
append
(
np
.
random
.
randint
(
0
,
num_output_tokens
))
stop_token_ids
.
append
(
set
())
return
(
min_tokens
,
stop_token_ids
)
def
_create_weighted_output_token_list
(
batch_size
:
int
,
vocab_size
:
int
)
->
Tuple
[
List
[
List
[
int
]],
List
[
List
[
int
]]]:
"""
Creates an output token list where each token occurs a distinct
number of times.
For each batch, a random subset of token IDs is selected from the
vocabulary. The selected tokens are then added to the output token
list, each with a different frequency.
Returns:
Tuple[List[List[int]], List[List[int]]]:
- The first element is the output token list, where each sublist
corresponds to a batch and contains tokens with weighted
frequencies.
- The second element is a list of distinct token IDs for each
batch, ordered by their frequency in the corresponding output
list.
"""
output_token_ids
:
List
[
List
[
int
]]
=
[]
sorted_token_ids_in_output
:
List
[
List
[
int
]]
=
[]
for
_
in
range
(
batch_size
):
distinct_token_ids
=
np
.
random
.
choice
(
vocab_size
,
size
=
np
.
random
.
randint
(
1
,
10
),
replace
=
False
).
tolist
()
sorted_token_ids_in_output
.
append
(
distinct_token_ids
)
output_token_ids_for_batch
=
[]
for
index
,
token_id
in
enumerate
(
distinct_token_ids
):
output_token_ids_for_batch
.
extend
(
[
token_id
for
_
in
range
(
index
+
1
)])
output_token_ids
.
append
(
output_token_ids_for_batch
)
return
(
output_token_ids
,
sorted_token_ids_in_output
)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
def test_sampler_min_tokens_penalty(device: str, batch_size: int):
    """
    Tests that if the number of output tokens is less than
    SamplingParams.min_tokens then we will set the logits for
    the stop token ids to -inf.
    """
    torch.set_default_device(device)
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))

    # np.random.randint excludes its upper bound. Use `batch_size + 1` for
    # the count and `batch_size` for the index range so that every request
    # (including the last one, and the only one when batch_size == 1) can
    # be selected for the min-tokens penalty.
    num_penalized = np.random.randint(0, batch_size + 1)
    batch_indices_for_min_token_penalty = np.random.randint(
        0, batch_size, size=num_penalized).tolist()

    min_tokens, stop_token_ids = _generate_min_token_penalties_and_stop_tokens(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE,
        batch_indices_for_min_token_penalty)
    sampling_metadata.min_tokens = min_tokens
    sampling_metadata.stop_token_ids = stop_token_ids

    sampler = Sampler()
    logits = sampler.apply_penalties(fake_logits, sampling_metadata)
    logits = logits.cpu()

    # Exactly the stop tokens of each request must be masked out; requests
    # that were not selected have an empty stop set and must be untouched.
    negative_infinity = -float("inf")
    for batch_idx in range(batch_size):
        for token_id in range(VOCAB_SIZE):
            if token_id in stop_token_ids[batch_idx]:
                assert logits[batch_idx][token_id] == negative_infinity
            else:
                assert logits[batch_idx][token_id] != negative_infinity
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
def test_sampler_presence_penalty(device: str, batch_size: int,
                                  presence_penalty: float):
    """
    Checks that an enabled presence penalty moves the logits of tokens
    according to whether they already appear in the request's output.
    """
    torch.set_default_device(device)
    target_device = torch.device(device)

    # All tokens start from the same logit value.
    uniform_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
    output_token_ids = sampling_metadata.output_token_ids
    sampling_metadata.presence_penalties = _create_penalty_tensor(
        batch_size, presence_penalty, target_device)
    sampling_metadata.no_penalties = False

    penalized_logits = Sampler().apply_penalties(uniform_logits,
                                                 sampling_metadata).cpu()

    for batch_idx in range(batch_size):
        row = penalized_logits[batch_idx]
        tokens_in_output = output_token_ids[batch_idx]
        # Starting from uniform logits, the arg-max is a token that was not
        # pushed down and the arg-min is a token that was.
        non_penalized_token_id = row.argmax().item()
        penalized_token_id = row.argmin().item()

        if presence_penalty > 0:
            # A positive penalty favours new tokens: the lowest logit must
            # belong to a token from the output, the highest to one that
            # is not in the output.
            assert penalized_token_id in tokens_in_output
            assert non_penalized_token_id not in tokens_in_output
        elif presence_penalty < 0:
            # A negative penalty favours tokens already produced: the
            # highest logit must belong to a token from the output, the
            # lowest to one that is not in the output.
            assert non_penalized_token_id in tokens_in_output
            assert penalized_token_id not in tokens_in_output
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
def test_sampler_frequency_penalty(device: str, batch_size: int,
                                   frequency_penalty: float):
    """
    Checks that an enabled frequency penalty moves the logits of tokens
    according to how often they occur in the request's output.
    """
    torch.set_default_device(device)
    target_device = torch.device(device)

    # All tokens start from the same logit value.
    uniform_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
    sampling_metadata.frequency_penalties = _create_penalty_tensor(
        batch_size, frequency_penalty, target_device)

    # Replace the random outputs with ones whose token frequencies are
    # known and strictly ordered.
    weighted_outputs, sorted_token_ids_in_output = (
        _create_weighted_output_token_list(batch_size, VOCAB_SIZE))
    sampling_metadata.output_token_ids = weighted_outputs
    sampling_metadata.no_penalties = False

    penalized_logits = Sampler().apply_penalties(uniform_logits,
                                                 sampling_metadata).cpu()

    for batch_idx in range(batch_size):
        row = penalized_logits[batch_idx]
        non_penalized_token_id = row.argmax().item()
        penalized_token_id = row.argmin().item()

        tokens_by_frequency = sorted_token_ids_in_output[batch_idx]
        # The list is ordered by ascending frequency, so the last entry is
        # the most frequent token.
        most_frequent_token_id = tokens_by_frequency[-1]

        if frequency_penalty > 0:
            # A positive penalty favours new tokens: the highest logit
            # must belong to a token absent from the output and the lowest
            # to the most frequent token.
            assert non_penalized_token_id not in tokens_by_frequency
            assert penalized_token_id == most_frequent_token_id
        elif frequency_penalty < 0:
            # A negative penalty favours repeated tokens: the highest
            # logit must belong to the most frequent token and the lowest
            # to a token that has not appeared yet.
            assert non_penalized_token_id == most_frequent_token_id
            assert penalized_token_id not in tokens_by_frequency
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
def test_sampler_repetition_penalty(device: str, batch_size: int,
                                    repetition_penalty: float):
    """
    Checks that an enabled repetition penalty moves the logits of tokens
    according to whether they appear in the prompt or in the existing
    output.
    """
    torch.set_default_device(device)
    target_device = torch.device(device)

    # All tokens start from the same logit value.
    uniform_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, target_device)
    sampling_metadata.repetition_penalties = _create_penalty_tensor(
        batch_size, repetition_penalty, target_device)
    sampling_metadata.no_penalties = False

    penalized_logits = Sampler().apply_penalties(uniform_logits,
                                                 sampling_metadata).cpu()

    for batch_idx in range(batch_size):
        row = penalized_logits[batch_idx]
        non_penalized_token_id = row.argmax().item()
        penalized_token_id = row.argmin().item()

        prompt_tokens = sampling_metadata.prompt_token_ids[
            batch_idx][:].tolist()
        output_tokens = sampling_metadata.output_token_ids[batch_idx]
        # A token counts as "seen" if it is in the prompt or the output.
        seen_tokens = set(prompt_tokens) | set(output_tokens)

        if repetition_penalty > 1.0:
            # Penalty > 1.0: the highest logit must belong to an unseen
            # token and the lowest to a seen one.
            assert non_penalized_token_id not in seen_tokens
            assert penalized_token_id in seen_tokens
        elif repetition_penalty < 1.0:
            # Penalty < 1.0: the lowest logit must belong to an unseen
            # token and the highest to a seen one.
            assert penalized_token_id not in seen_tokens
            assert non_penalized_token_id in seen_tokens
tests/v1/worker/__init__.py
0 → 100644
View file @
96ae75ad
tests/v1/worker/test_gpu_input_batch.py
0 → 100644
View file @
96ae75ad
from
typing
import
Dict
,
List
,
Set
,
Tuple
import
numpy
as
np
import
pytest
import
torch
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
is_pin_memory_available
,
make_tensor_with_pad
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
# Exclusive upper bound for randomly generated token ids; also used as the
# pad value of the expected prompt-token tensor.
VOCAB_SIZE = 1024
# Exclusive upper bound for the random number of output tokens per request.
NUM_OUTPUT_TOKENS = 20
# Exclusive upper bound for the random prompt length per request.
MAX_PROMPT_SIZE = 100
# Devices the test is parametrized over: only "cuda:0" when exactly one
# CUDA device is reported, otherwise "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{device_index}"
    for device_index in range(1 if torch.cuda.device_count() == 1 else 2)
]
MAX_NUM_PROMPT_TOKENS = 64
def
_remove_requests
(
input_batch
:
InputBatch
,
batch_size
:
int
,
reqs
:
List
[
CachedRequestState
])
->
Tuple
[
Set
[
str
],
List
[
int
]]:
"""
Remove some requests randomly from the batch and returns a Tuple
of 1) set of request removed 2) indices of the requests removed
ordered in descending order
"""
num_reqs_to_remove
=
np
.
random
.
randint
(
0
,
batch_size
)
req_indices_to_remove
:
Set
[
int
]
=
set
()
for
_
in
range
(
num_reqs_to_remove
):
req_index_to_remove
=
np
.
random
.
randint
(
0
,
batch_size
)
req_indices_to_remove
.
add
(
req_index_to_remove
)
req_indices_to_remove_list
=
list
(
req_indices_to_remove
)
req_indices_to_remove_list
.
sort
(
reverse
=
True
)
req_ids_to_remove
:
Set
[
str
]
=
set
()
for
index
in
req_indices_to_remove
:
input_batch
.
remove_request
(
reqs
[
index
].
req_id
)
req_ids_to_remove
.
add
(
reqs
[
index
].
req_id
)
return
(
req_ids_to_remove
,
req_indices_to_remove_list
)
def _construct_expected_sampling_metadata(
        reqs: List[CachedRequestState], req_ids_retained: Set[str],
        req_id_index_in_input_batch: Dict[str, int],
        device: torch.device) -> SamplingMetadata:
    """
    Constructs and returns the expected SamplingMetadata for this
    batch.

    Args:
        reqs: Every request that was originally added to the batch.
        req_ids_retained: IDs (strings) of the requests still in the batch.
        req_id_index_in_input_batch: Maps a request ID to its row index in
            the input batch.
        device: Device on which the expected tensors are created.

    Requests whose ID is not in `req_ids_retained` are ignored. The
    per-request values of each retained request are written to the row
    given by `req_id_index_in_input_batch`. `all_greedy`, `all_random`,
    `generators` and `max_num_logprobs` are filled with fixed values.
    """
    num_reqs = len(req_ids_retained)

    # Per-row defaults, overwritten below for every retained request.
    output_token_ids: List[List[int]] = [[] for _ in range(num_reqs)]
    prompt_token_ids: List[List[int]] = [[] for _ in range(num_reqs)]
    presence_penalties = [0.0] * num_reqs
    frequency_penalties = [0.0] * num_reqs
    repetition_penalties = [1.0] * num_reqs
    top_k = [0] * num_reqs
    top_p = [0.0] * num_reqs
    temperature = [0.0] * num_reqs
    stop_token_ids: List[Set[int]] = [set() for _ in range(num_reqs)]
    min_tokens = [0] * num_reqs

    for req in reqs:
        if req.req_id not in req_ids_retained:
            continue
        row = req_id_index_in_input_batch[req.req_id]
        params = req.sampling_params
        output_token_ids[row] = req.output_token_ids
        prompt_token_ids[row] = req.prompt_token_ids
        presence_penalties[row] = params.presence_penalty
        frequency_penalties[row] = params.frequency_penalty
        repetition_penalties[row] = params.repetition_penalty
        top_k[row] = params.top_k
        top_p[row] = params.top_p
        temperature[row] = params.temperature
        stop_token_ids[row] = params.all_stop_token_ids
        min_tokens[row] = params.min_tokens

    # Penalties are a no-op only when every row holds the neutral value:
    # 0 for presence/frequency and 1 for repetition.
    no_penalties = (all(x == 0 for x in presence_penalties)
                    and all(x == 0 for x in frequency_penalties)
                    and all(x == 1 for x in repetition_penalties))

    return SamplingMetadata(
        temperature=torch.tensor(temperature, dtype=torch.float,
                                 device=device),
        all_greedy=False,
        all_random=True,
        top_p=torch.tensor(top_p, dtype=torch.float, device=device),
        top_k=torch.tensor(top_k, dtype=torch.int, device=device),
        no_top_p=all(x == 1.0 for x in top_p),
        no_top_k=all(x == 0 for x in top_k),
        generators={},
        max_num_logprobs=0,
        prompt_token_ids=make_tensor_with_pad(
            prompt_token_ids,
            pad=VOCAB_SIZE,
            device=device,
            dtype=torch.int64,
        ),
        frequency_penalties=torch.tensor(frequency_penalties,
                                         dtype=torch.float,
                                         device=device),
        presence_penalties=torch.tensor(presence_penalties,
                                        dtype=torch.float,
                                        device=device),
        repetition_penalties=torch.tensor(repetition_penalties,
                                          dtype=torch.float,
                                          device=device),
        output_token_ids=output_token_ids,
        min_tokens=min_tokens,
        stop_token_ids=stop_token_ids,
        no_penalties=no_penalties,
    )
def _create_sampling_params():
    """Return a ``SamplingParams`` whose top-k, top-p, penalty, min-tokens
    and stop-token settings are all drawn at random."""
    # The draws are kept in the same order as the keyword arguments below.
    top_k = np.random.randint(1, 10)
    top_p = np.random.uniform(0.0, 1.0)
    presence_penalty = np.random.uniform(-2.0, 2.0)
    repetition_penalty = np.random.uniform(0.0, 2.0)
    frequency_penalty = np.random.uniform(-2.0, 2.0)
    min_tokens = np.random.randint(1, 10)

    # Between 0 and 9 stop tokens, each below VOCAB_SIZE.
    num_stop_tokens = np.random.randint(10)
    stop_token_ids = []
    for _ in range(num_stop_tokens):
        stop_token_ids.append(np.random.randint(0, VOCAB_SIZE))

    return SamplingParams(top_k=top_k,
                          top_p=top_p,
                          presence_penalty=presence_penalty,
                          repetition_penalty=repetition_penalty,
                          frequency_penalty=frequency_penalty,
                          min_tokens=min_tokens,
                          stop_token_ids=stop_token_ids)
def _construct_cached_request_state(req_id_suffix: int):
    """Return a ``CachedRequestState`` named ``req_id_<req_id_suffix>`` with
    a random prompt, random output tokens and random sampling params.

    The prompt has fewer than ``MAX_PROMPT_SIZE`` tokens and the output has
    fewer than ``NUM_OUTPUT_TOKENS`` tokens; either may be empty.
    """
    prompt_length = np.random.randint(0, MAX_PROMPT_SIZE)
    prompt_token_ids = []
    for _ in range(prompt_length):
        prompt_token_ids.append(np.random.randint(0, VOCAB_SIZE))

    output_length = np.random.randint(0, NUM_OUTPUT_TOKENS)
    output_token_ids = []
    for _ in range(output_length):
        output_token_ids.append(np.random.randint(0, VOCAB_SIZE))

    # Sampling params are drawn after both token lists.
    sampling_params = _create_sampling_params()

    return CachedRequestState(
        req_id=f"req_id_{req_id_suffix}",
        prompt_token_ids=prompt_token_ids,
        prompt=None,
        sampling_params=sampling_params,
        mm_inputs=[],
        mm_positions=[],
        block_ids=[],
        generator=None,
        num_computed_tokens=len(output_token_ids),
        output_token_ids=output_token_ids,
    )
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
    """
    Tests the logic for managing sampling metadata in the InputBatch.

    This test involves adding a set of requests to the InputBatch,
    followed by removing a subset of them. Afterward, the batch is compacted,
    and the `make_sampling_metadata` method is invoked on the batch. The
    output of `make_sampling_metadata` is then compared against the expected
    results to ensure correctness.
    """
    input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
        max_num_blocks_per_req=10,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        # Use the module constant: the expected prompt tensor is padded
        # with VOCAB_SIZE, so both sides must agree on the vocabulary size.
        vocab_size=VOCAB_SIZE,
    )
    reqs: List[CachedRequestState] = []
    req_id_output_token_ids = {}

    # Add requests
    for req_index in range(batch_size):
        req: CachedRequestState = _construct_cached_request_state(req_index)
        input_batch.add_request(req, req_index)
        reqs.append(req)
        req_id_output_token_ids[req.req_id] = req.output_token_ids

    # Remove some requests
    req_ids_to_remove, req_indices_to_remove = _remove_requests(
        input_batch, batch_size, reqs)
    req_ids_retained = {req.req_id for req in reqs} - req_ids_to_remove

    # Compact the input batch
    input_batch.condense(req_indices_to_remove)

    # Generate the sampling metadata
    sampling_metadata = input_batch.make_sampling_metadata(
        req_id_output_token_ids, skip_copy=False)

    # Create expected output.
    expected_sampling_metadata = _construct_expected_sampling_metadata(
        reqs,
        req_ids_retained,
        input_batch.req_id_to_index,
        device=torch.device(device))

    # Assert the actual and expected output. Tensor-valued fields are
    # compared with allclose, the remaining fields with plain equality.
    tensor_fields = (
        "temperature",
        "top_p",
        "top_k",
        "frequency_penalties",
        "presence_penalties",
        "repetition_penalties",
        "prompt_token_ids",
    )
    for field in tensor_fields:
        assert torch.allclose(getattr(expected_sampling_metadata, field),
                              getattr(sampling_metadata, field)), field

    plain_fields = (
        "output_token_ids",
        "min_tokens",
        "stop_token_ids",
        "no_penalties",
        "no_top_p",
        "no_top_k",
    )
    for field in plain_fields:
        assert (getattr(expected_sampling_metadata,
                        field) == getattr(sampling_metadata, field)), field
tests/weight_loading/models.txt
View file @
96ae75ad
...
@@ -21,6 +21,8 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
...
@@ -21,6 +21,8 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
awq, casperhansen/mixtral-instruct-awq, main
awq, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
...
...
Prev
1
…
8
9
10
11
12
13
14
15
16
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment