Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f2b20fe4
Unverified
Commit
f2b20fe4
authored
Feb 14, 2025
by
Harry Mellor
Committed by
GitHub
Feb 13, 2025
Browse files
Consolidate Llama model usage in tests (#13094)
parent
40932d7a
Changes
22
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
50 deletions
+42
-50
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+5
-5
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+3
-3
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+1
-1
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+1
-1
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+1
-1
tests/compile/utils.py
tests/compile/utils.py
+4
-10
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+2
-2
tests/entrypoints/openai/test_serving_models.py
tests/entrypoints/openai/test_serving_models.py
+1
-1
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+1
-1
tests/kv_transfer/disagg_test.py
tests/kv_transfer/disagg_test.py
+4
-6
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+4
-4
tests/models/registry.py
tests/models/registry.py
+1
-1
tests/quantization/test_register_quantization_config.py
tests/quantization/test_register_quantization_config.py
+1
-1
tests/samplers/test_ignore_eos.py
tests/samplers/test_ignore_eos.py
+1
-1
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+3
-3
tests/test_config.py
tests/test_config.py
+1
-1
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+4
-4
tests/tokenization/test_detokenize.py
tests/tokenization/test_detokenize.py
+1
-1
tests/tokenization/test_get_eos.py
tests/tokenization/test_get_eos.py
+2
-2
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+1
-1
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
f2b20fe4
...
@@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
...
@@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
MODELS
=
[
MODELS
=
[
"google/gemma-2-2b-it"
,
"google/gemma-2-2b-it"
,
"meta-llama/Llama-3.2-1B"
,
"meta-llama/Llama-3.2-1B
-Instruct
"
,
]
]
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
...
@@ -96,12 +96,12 @@ def test_models(
...
@@ -96,12 +96,12 @@ def test_models(
"test_suite"
,
[
"test_suite"
,
[
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-
2-7b-hf
"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-
2-7b-hf
"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"mp"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/
Meta-
Llama-3
-8B
"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/Llama-3
.2-1B-Instruct
"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
])
])
def
test_models_distributed
(
def
test_models_distributed
(
hf_runner
,
hf_runner
,
...
@@ -116,7 +116,7 @@ def test_models_distributed(
...
@@ -116,7 +116,7 @@ def test_models_distributed(
if
test_suite
!=
TARGET_TEST_SUITE
:
if
test_suite
!=
TARGET_TEST_SUITE
:
pytest
.
skip
(
f
"Skip test for
{
test_suite
}
"
)
pytest
.
skip
(
f
"Skip test for
{
test_suite
}
"
)
if
model
==
"meta-llama/Llama-
2-7b-hf
"
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
:
# noqa
if
model
==
"meta-llama/Llama-
3.2-1B-Instruct
"
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
:
# noqa
# test ray adag
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
f2b20fe4
...
@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
...
@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
"meta-llama/Llama-3.2-1B"
,
"meta-llama/Llama-3.2-1B
-Instruct
"
,
]
]
...
@@ -92,7 +92,7 @@ def test_models_distributed(
...
@@ -92,7 +92,7 @@ def test_models_distributed(
)
->
None
:
)
->
None
:
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
if
(
model
==
"meta-llama/Llama-
2-7b-hf
"
if
(
model
==
"meta-llama/Llama-
3.2-1B-Instruct
"
and
distributed_executor_backend
==
"ray"
):
and
distributed_executor_backend
==
"ray"
):
# test ray adag
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
...
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
...
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
with chunked prefill enabled.
"""
"""
model
=
"meta-llama/Llama-
2-7b-chat-hf
"
model
=
"meta-llama/Llama-
3.2-1B-Instruct
"
# The common prompt has 142 tokens with Llama-2 tokenizer.
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt
=
"You are a helpful AI assistant "
*
20
common_prompt
=
"You are a helpful AI assistant "
*
20
unique_prompts
=
[
unique_prompts
=
[
...
...
tests/basic_correctness/test_cpu_offload.py
View file @
f2b20fe4
...
@@ -4,5 +4,5 @@ from ..utils import compare_two_settings
...
@@ -4,5 +4,5 @@ from ..utils import compare_two_settings
def
test_cpu_offload
():
def
test_cpu_offload
():
compare_two_settings
(
"meta-llama/Llama-3.2-1B"
,
[],
compare_two_settings
(
"meta-llama/Llama-3.2-1B
-Instruct
"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[
"--cpu-offload-gb"
,
"1"
])
tests/basic_correctness/test_cumem.py
View file @
f2b20fe4
...
@@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
...
@@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model"
,
"model"
,
[
[
"meta-llama/Llama-3.2-1B"
,
# sleep mode with safetensors
"meta-llama/Llama-3.2-1B
-Instruct
"
,
# sleep mode with safetensors
"facebook/opt-125m"
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
# sleep mode with pytorch checkpoint
])
])
def
test_end_to_end
(
model
):
def
test_end_to_end
(
model
):
...
...
tests/compile/test_basic_correctness.py
View file @
f2b20fe4
...
@@ -26,7 +26,7 @@ class TestSetting:
...
@@ -26,7 +26,7 @@ class TestSetting:
test_settings
=
[
test_settings
=
[
# basic llama model
# basic llama model
TestSetting
(
TestSetting
(
model
=
"meta-llama/Llama-3.2-1B"
,
model
=
"meta-llama/Llama-3.2-1B
-Instruct
"
,
model_args
=
[],
model_args
=
[],
pp_size
=
2
,
pp_size
=
2
,
tp_size
=
2
,
tp_size
=
2
,
...
...
tests/compile/utils.py
View file @
f2b20fe4
...
@@ -6,7 +6,6 @@ import torch
...
@@ -6,7 +6,6 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
CompilationLevel
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
TEST_MODELS
=
[
TEST_MODELS
=
[
...
@@ -15,14 +14,14 @@ TEST_MODELS = [
...
@@ -15,14 +14,14 @@ TEST_MODELS = [
"dtype"
:
torch
.
float16
,
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
"quantization"
:
"compressed-tensors"
}),
}),
(
"neuralmagic/
Meta-
Llama-3
-8
B-Instruct-FP8"
,
{
(
"neuralmagic/Llama-3
.2-1
B-Instruct-FP8
-dynamic
"
,
{
"dtype"
:
torch
.
float16
,
"dtype"
:
torch
.
float16
,
"quantization"
:
"
fp8
"
"quantization"
:
"
compressed-tensors
"
}),
}),
(
"n
m-testing/Meta-
Llama-3
-8
B-Instruct-
W8A8-Dyn-Per-Token-2048-Samples
"
,
{
(
"n
euralmagic/
Llama-3
.2-1
B-Instruct-
quantized.w8a8
"
,
{
"quantization"
:
"compressed-tensors"
"quantization"
:
"compressed-tensors"
}),
}),
(
"meta-llama/
Meta-
Llama-3
-8B
"
,
{}),
(
"meta-llama/Llama-3
.2-1B-Instruct
"
,
{}),
]
]
if
is_quant_method_supported
(
"aqlm"
):
if
is_quant_method_supported
(
"aqlm"
):
...
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
...
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
# make sure these models can be captured in full graph mode
# make sure these models can be captured in full graph mode
os
.
environ
[
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"1"
os
.
environ
[
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"1"
# The base meta llama uses too much memory.
if
(
model
==
"meta-llama/Meta-Llama-3-8B"
and
optimization_level
>=
CompilationLevel
.
PIECEWISE
):
return
print
(
f
"MODEL=
{
model
}
"
)
print
(
f
"MODEL=
{
model
}
"
)
prompts
=
[
prompts
=
[
...
...
tests/distributed/test_pipeline_parallel.py
View file @
f2b20fe4
...
@@ -162,7 +162,7 @@ TEXT_GENERATION_MODELS = {
...
@@ -162,7 +162,7 @@ TEXT_GENERATION_MODELS = {
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(),
"inceptionai/jais-13b-chat"
:
PPTestSettings
.
fast
(),
"inceptionai/jais-13b-chat"
:
PPTestSettings
.
fast
(),
"ai21labs/Jamba-tiny-dev"
:
PPTestSettings
.
fast
(),
"ai21labs/Jamba-tiny-dev"
:
PPTestSettings
.
fast
(),
"meta-llama/
Meta-
Llama-3
-8B
"
:
PPTestSettings
.
detailed
(),
"meta-llama/Llama-3
.2-1B-Instruct
"
:
PPTestSettings
.
detailed
(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(),
# Uses Llama
# Uses Llama
...
@@ -230,7 +230,7 @@ MULTIMODAL_MODELS = {
...
@@ -230,7 +230,7 @@ MULTIMODAL_MODELS = {
TEST_MODELS
=
[
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct"
,
"microsoft/Phi-3.5-MoE-instruct"
,
"meta-llama/
Meta-
Llama-3
-8B
"
,
"meta-llama/Llama-3
.2-1B-Instruct
"
,
"ibm/PowerLM-3b"
,
"ibm/PowerLM-3b"
,
# [LANGUAGE EMBEDDING]
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct"
,
"intfloat/e5-mistral-7b-instruct"
,
...
...
tests/entrypoints/openai/test_serving_models.py
View file @
f2b20fe4
...
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
...
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels
)
OpenAIServingModels
)
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
MODEL_NAME
=
"meta-llama/Llama-
2-7b
"
MODEL_NAME
=
"meta-llama/Llama-
3.2-1B-Instruct
"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
LORA_LOADING_SUCCESS_MESSAGE
=
(
LORA_LOADING_SUCCESS_MESSAGE
=
(
"Success: LoRA adapter '{lora_name}' added successfully."
)
"Success: LoRA adapter '{lora_name}' added successfully."
)
...
...
tests/entrypoints/openai/test_shutdown.py
View file @
f2b20fe4
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"meta-llama/Llama-3.2-1B"
MODEL_NAME
=
"meta-llama/Llama-3.2-1B
-Instruct
"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/kv_transfer/disagg_test.py
View file @
f2b20fe4
...
@@ -28,7 +28,7 @@ def setup_servers():
...
@@ -28,7 +28,7 @@ def setup_servers():
"-m"
,
"-m"
,
"vllm.entrypoints.openai.api_server"
,
"vllm.entrypoints.openai.api_server"
,
"--model"
,
"--model"
,
"meta-llama/
Meta-
Llama-3.
1-8
B-Instruct"
,
"meta-llama/Llama-3.
2-1
B-Instruct"
,
"--port"
,
"--port"
,
"8100"
,
"8100"
,
"--gpu-memory-utilization"
,
"--gpu-memory-utilization"
,
...
@@ -49,7 +49,7 @@ def setup_servers():
...
@@ -49,7 +49,7 @@ def setup_servers():
"-m"
,
"-m"
,
"vllm.entrypoints.openai.api_server"
,
"vllm.entrypoints.openai.api_server"
,
"--model"
,
"--model"
,
"meta-llama/
Meta-
Llama-3.
1-8
B-Instruct"
,
"meta-llama/Llama-3.
2-1
B-Instruct"
,
"--port"
,
"--port"
,
"8200"
,
"8200"
,
"--gpu-memory-utilization"
,
"--gpu-memory-utilization"
,
...
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
...
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
response
=
requests
.
post
(
"http://localhost:8100/v1/completions"
,
response
=
requests
.
post
(
"http://localhost:8100/v1/completions"
,
headers
=
{
"Content-Type"
:
"application/json"
},
headers
=
{
"Content-Type"
:
"application/json"
},
json
=
{
json
=
{
"model"
:
"model"
:
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
"prompt"
:
prompt
,
"prompt"
:
prompt
,
"max_tokens"
:
1
,
"max_tokens"
:
1
,
"temperature"
:
0
"temperature"
:
0
...
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
...
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
response
=
requests
.
post
(
"http://localhost:8200/v1/completions"
,
response
=
requests
.
post
(
"http://localhost:8200/v1/completions"
,
headers
=
{
"Content-Type"
:
"application/json"
},
headers
=
{
"Content-Type"
:
"application/json"
},
json
=
{
json
=
{
"model"
:
"model"
:
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
"prompt"
:
prompt
,
"prompt"
:
prompt
,
"max_tokens"
:
10
,
"max_tokens"
:
10
,
"temperature"
:
0
"temperature"
:
0
...
...
tests/models/decoder_only/language/test_fp8.py
View file @
f2b20fe4
...
@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
...
@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
(
"fp8_e4m3"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
(
"fp8_e4m3"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"
),
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"
),
# Test F
P
16 checkpoint w. fp8_e5m2 kv-cache.
# Test
B
F16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
(
"fp8_e5m2"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Llama-3.2-1B-Instruct"
),
"meta-llama/Llama-3.2-1B-Instruct"
),
# Test F
P
16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
# Test
B
F16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
"meta-llama/Llama-
2-7b-chat-hf
"
,
(
"fp8_e4m3"
,
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"meta-llama/Llama-
2-7b-chat-hf
"
)
"meta-llama/Llama-
3.2-1B-Instruct
"
)
])
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
...
...
tests/models/registry.py
View file @
f2b20fe4
...
@@ -141,7 +141,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -141,7 +141,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/
Meta-
Llama-3
-8B
"
),
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3
.2-1B-Instruct
"
),
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
is_available_online
=
False
),
is_available_online
=
False
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
...
...
tests/quantization/test_register_quantization_config.py
View file @
f2b20fe4
...
@@ -99,7 +99,7 @@ def test_register_quantization_config():
...
@@ -99,7 +99,7 @@ def test_register_quantization_config():
@
pytest
.
mark
.
parametrize
(
argnames
=
"model"
,
@
pytest
.
mark
.
parametrize
(
argnames
=
"model"
,
argvalues
=
[
argvalues
=
[
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
,
"meta-llama/Llama-3
.2-1
B-Instruct"
,
])
])
def
test_custom_quant
(
vllm_runner
,
model
):
def
test_custom_quant
(
vllm_runner
,
model
):
"""Test infer with the custom quantization method."""
"""Test infer with the custom quantization method."""
...
...
tests/samplers/test_ignore_eos.py
View file @
f2b20fe4
...
@@ -10,7 +10,7 @@ from vllm import SamplingParams
...
@@ -10,7 +10,7 @@ from vllm import SamplingParams
# We also test with llama because it has generation_config to specify EOS
# We also test with llama because it has generation_config to specify EOS
# (past regression).
# (past regression).
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-
2-7b-hf
"
]
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-
3.2-1B-Instruct
"
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
tests/spec_decode/e2e/test_compatibility.py
View file @
f2b20fe4
...
@@ -8,7 +8,7 @@ from .conftest import get_output_from_llm_generator
...
@@ -8,7 +8,7 @@ from .conftest import get_output_from_llm_generator
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"meta-llama/Llama-
2-7b-chat-hf
"
,
"model"
:
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
}])
}])
...
@@ -27,8 +27,8 @@ from .conftest import get_output_from_llm_generator
...
@@ -27,8 +27,8 @@ from .conftest import get_output_from_llm_generator
},
},
{
{
# Speculative max model len > target max model len should raise.
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Llama-
2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590
/config.json#L1
2
# https://huggingface.co/meta-llama/Llama-
3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6
/config.json#L1
8
"speculative_max_model_len"
:
4096
+
1
,
"speculative_max_model_len"
:
131072
+
1
,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
...
...
tests/test_config.py
View file @
f2b20fe4
...
@@ -251,7 +251,7 @@ def test_rope_customization():
...
@@ -251,7 +251,7 @@ def test_rope_customization():
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
(
"facebook/opt-125m"
,
False
),
(
"facebook/opt-125m"
,
False
),
(
"facebook/bart-base"
,
True
),
(
"facebook/bart-base"
,
True
),
(
"meta-llama/Llama-3.2-1B"
,
False
),
(
"meta-llama/Llama-3.2-1B
-Instruct
"
,
False
),
(
"meta-llama/Llama-3.2-11B-Vision"
,
True
),
(
"meta-llama/Llama-3.2-11B-Vision"
,
True
),
])
])
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
...
...
tests/test_sharded_state_loader.py
View file @
f2b20fe4
...
@@ -46,9 +46,9 @@ def test_filter_subtensors():
...
@@ -46,9 +46,9 @@ def test_filter_subtensors():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llama_2_
7
b_files
():
def
llama_
3p
2_
1
b_files
():
with
TemporaryDirectory
()
as
cache_dir
:
with
TemporaryDirectory
()
as
cache_dir
:
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B"
,
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B
-Instruct
"
,
cache_dir
=
cache_dir
,
cache_dir
=
cache_dir
,
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
...
@@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
...
@@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
llama_2_
7
b_files
):
llama_
3p
2_
1
b_files
):
if
num_gpus_available
<
tp_size
:
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
weights_patterns
=
(
"*.safetensors"
,
)
weights_patterns
=
(
"*.safetensors"
,
)
gpu_memory_utilization
=
0.8
gpu_memory_utilization
=
0.8
input_dir
=
llama_2_
7
b_files
input_dir
=
llama_
3p
2_
1
b_files
ctx
=
mp
.
get_context
(
"spawn"
)
ctx
=
mp
.
get_context
(
"spawn"
)
# Run in separate processes for memory & CUDA isolation
# Run in separate processes for memory & CUDA isolation
...
...
tests/tokenization/test_detokenize.py
View file @
f2b20fe4
...
@@ -31,7 +31,7 @@ TOKENIZERS = [
...
@@ -31,7 +31,7 @@ TOKENIZERS = [
"bigscience/bloom-560m"
,
"bigscience/bloom-560m"
,
"mosaicml/mpt-7b"
,
"mosaicml/mpt-7b"
,
"tiiuae/falcon-7b"
,
"tiiuae/falcon-7b"
,
"meta-llama/Llama-
2-7b-hf
"
,
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"codellama/CodeLlama-7b-hf"
,
"codellama/CodeLlama-7b-hf"
,
"mistralai/Pixtral-12B-2409"
,
"mistralai/Pixtral-12B-2409"
,
]
]
...
...
tests/tokenization/test_get_eos.py
View file @
f2b20fe4
...
@@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
...
@@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
def
test_get_llama3_eos_token
():
def
test_get_llama3_eos_token
():
model_name
=
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
model_name
=
"meta-llama/Llama-3
.2-1
B-Instruct"
tokenizer
=
get_tokenizer
(
model_name
)
tokenizer
=
get_tokenizer
(
model_name
)
assert
tokenizer
.
eos_token_id
==
128009
assert
tokenizer
.
eos_token_id
==
128009
...
@@ -17,7 +17,7 @@ def test_get_llama3_eos_token():
...
@@ -17,7 +17,7 @@ def test_get_llama3_eos_token():
generation_config
=
try_get_generation_config
(
model_name
,
generation_config
=
try_get_generation_config
(
model_name
,
trust_remote_code
=
False
)
trust_remote_code
=
False
)
assert
generation_config
is
not
None
assert
generation_config
is
not
None
assert
generation_config
.
eos_token_id
==
[
128001
,
128009
]
assert
generation_config
.
eos_token_id
==
[
128001
,
128008
,
128009
]
def
test_get_blip2_eos_token
():
def
test_get_blip2_eos_token
():
...
...
tests/v1/engine/test_async_llm.py
View file @
f2b20fe4
...
@@ -17,7 +17,7 @@ if not current_platform.is_cuda():
...
@@ -17,7 +17,7 @@ if not current_platform.is_cuda():
pytest
.
skip
(
reason
=
"V1 currently only supported on CUDA."
,
pytest
.
skip
(
reason
=
"V1 currently only supported on CUDA."
,
allow_module_level
=
True
)
allow_module_level
=
True
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
"meta-llama/Llama-3.2-1B"
,
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
"meta-llama/Llama-3.2-1B
-Instruct
"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
disable_log_requests
=
True
)
disable_log_requests
=
True
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment