Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
87a2e37f
Commit
87a2e37f
authored
Nov 27, 2024
by
zhuwenwen
Browse files
update tests
parent
3c9817d2
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
34 additions
and
31 deletions
+34
-31
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+2
-2
tests/compile/utils.py
tests/compile/utils.py
+16
-16
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+2
-1
tests/tokenization/test_tokenizer_group.py
tests/tokenization/test_tokenizer_group.py
+7
-7
tests/tool_use/conftest.py
tests/tool_use/conftest.py
+2
-2
tests/tool_use/utils.py
tests/tool_use/utils.py
+4
-2
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+1
-1
No files found.
tests/basic_correctness/test_preemption.py
View file @
87a2e37f
...
...
@@ -20,7 +20,7 @@ import os
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
]
]
@
pytest
.
fixture
(
scope
=
"module"
,
autouse
=
True
)
...
...
tests/compile/utils.py
View file @
87a2e37f
...
...
@@ -22,10 +22,10 @@ TEST_MODELS = [
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
(
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
),
{
"dtype"
:
torch
.
float16
,
"quantization"
:
"fp8"
}),
#
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
#
"dtype": torch.float16,
#
"quantization": "fp8"
#
}),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"
),
{
"quantization"
:
"compressed-tensors"
}),
...
...
@@ -49,20 +49,20 @@ if is_quant_method_supported("gptq"):
"quantization"
:
"gptq"
}))
if
is_quant_method_supported
(
"gptq_marlin"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
),
{
"quantization"
:
"gptq_marlin"
}))
#
if is_quant_method_supported("gptq_marlin"):
#
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
#
"quantization": "gptq_marlin"
#
}))
if
is_quant_method_supported
(
"gptq_marlin_24"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"alexm-nm/tinyllama-24-marlin24-4bit-g128"
),
{
"quantization"
:
"gptq_marlin_24"
}))
#
if is_quant_method_supported("gptq_marlin_24"):
#
TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
#
"quantization": "gptq_marlin_24"
#
}))
if
is_quant_method_supported
(
"marlin"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"
),
{
"quantization"
:
"marlin"
}))
#
if is_quant_method_supported("marlin"):
#
TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
#
"quantization": "marlin"
#
}))
if
not
is_hip
()
and
is_quant_method_supported
(
"awq"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
),
{
...
...
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
87a2e37f
...
...
@@ -24,6 +24,7 @@ import os
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
import
vllm.envs
as
envs
# main model
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
...
@@ -36,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-ra
MAX_SPEC_TOKENS
=
4
# precision
PRECISION
=
"float32"
PRECISION
=
"float32"
if
envs
.
VLLM_USE_TRITON_FLASH_ATTN
else
"half"
@
pytest
.
mark
.
parametrize
(
...
...
tests/tokenization/test_tokenizer_group.py
View file @
87a2e37f
...
...
@@ -34,7 +34,7 @@ async def test_tokenizer_group(tokenizer_group_type):
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
tokenizer_group
=
get_tokenizer_group
(
get_tokenizer_pool_config
(
tokenizer_group_type
),
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
@@ -58,7 +58,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
tokenizer_group_pool
=
get_tokenizer_group
(
get_tokenizer_pool_config
(
tokenizer_group_type
),
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
@@ -100,7 +100,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
tokenizer_pool_config
=
get_tokenizer_pool_config
(
tokenizer_group_type
)
tokenizer_pool
=
EnvVarCheckerRayTokenizerGroupPool
.
from_config
(
tokenizer_pool_config
,
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
)
...
...
@@ -111,7 +111,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
tokenizer_pool_config
=
get_tokenizer_pool_config
(
tokenizer_group_type
)
tokenizer_pool
=
EnvVarCheckerRayTokenizerGroupPool
.
from_config
(
tokenizer_pool_config
,
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
)
...
...
@@ -148,7 +148,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
tokenizer_pool_config
=
get_tokenizer_pool_config
(
tokenizer_group_type
)
tokenizer_group_pool
=
FailingRayTokenizerGroupPool
.
from_config
(
tokenizer_pool_config
,
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
@@ -175,7 +175,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
fail_at
=
[
1
]
tokenizer_group_pool
=
FailingRayTokenizerGroupPool
.
from_config
(
tokenizer_pool_config
,
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
@@ -196,7 +196,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
fail_at
=
[]
tokenizer_group_pool
=
FailingRayTokenizerGroupPool
.
from_config
(
tokenizer_pool_config
,
tokenizer_id
=
"gpt2"
,
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
2
,
...
...
tests/tool_use/conftest.py
View file @
87a2e37f
import
pytest
import
pytest_asyncio
from
huggingface_hub
import
snapshot_download
#
from huggingface_hub import snapshot_download
from
tests.utils
import
RemoteOpenAIServer
...
...
@@ -12,7 +12,7 @@ from .utils import ARGS, CONFIGS, ServerConfig
def
server_config
(
request
):
config
=
CONFIGS
[
request
.
param
]
# download model and tokenizer using transformers
snapshot_download
(
config
[
"model"
])
#
snapshot_download(config["model"])
yield
CONFIGS
[
request
.
param
]
...
...
tests/tool_use/utils.py
View file @
87a2e37f
from
typing
import
Dict
,
List
import
os
from
openai.types.chat
import
(
ChatCompletionMessageParam
,
ChatCompletionToolParam
)
from
typing_extensions
import
TypedDict
from
tests.utils
import
VLLM_PATH
from
..utils
import
models_path_prefix
class
ServerConfig
(
TypedDict
):
...
...
@@ -19,7 +21,7 @@ ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"]
CONFIGS
:
Dict
[
str
,
ServerConfig
]
=
{
"hermes"
:
{
"model"
:
"NousResearch/Hermes-3-Llama-3.1-8B"
,
os
.
path
.
join
(
models_path_prefix
,
"NousResearch/Hermes-3-Llama-3.1-8B"
)
,
"arguments"
:
[
"--tool-call-parser"
,
"hermes"
,
"--chat-template"
,
str
(
VLLM_PATH
/
"examples/tool_chat_template_hermes.jinja"
)
...
...
@@ -27,7 +29,7 @@ CONFIGS: Dict[str, ServerConfig] = {
},
"mistral"
:
{
"model"
:
"mistralai/Mistral-7B-Instruct-v0.3"
,
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mistral-7B-Instruct-v0.3"
)
,
"arguments"
:
[
"--tool-call-parser"
,
"mistral"
,
"--chat-template"
,
str
(
VLLM_PATH
/
"examples/tool_chat_template_mistral.jinja"
),
...
...
vllm/model_executor/models/llama.py
View file @
87a2e37f
...
...
@@ -582,7 +582,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
for
layername
,
weight
in
params_dict
.
items
():
if
"lm_head.weight"
in
layername
:
if
"lm_head.weight"
in
layername
and
weight
.
shape
[
1
]
>=
4096
:
lay_key_words
.
append
(
"lm_head.weight"
)
combined_words
=
"|"
.
join
(
lay_key_words
)
os
.
environ
[
'LM_NN'
]
=
'1'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment