Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1591c68f
Commit
1591c68f
authored
May 25, 2024
by
zhuwenwen
Browse files
merge v0.4.2
parents
09bcf00b
c7f2cf2b
Changes
265
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
795 additions
and
140 deletions
+795
-140
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+37
-0
tests/entrypoints/test_guided_processors.py
tests/entrypoints/test_guided_processors.py
+3
-1
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+56
-1
tests/kernels/conftest.py
tests/kernels/conftest.py
+7
-1
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+17
-18
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+77
-0
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+2
-2
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+38
-12
tests/lora/test_layers.py
tests/lora/test_layers.py
+23
-6
tests/lora/test_punica.py
tests/lora/test_punica.py
+49
-2
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+124
-0
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+29
-1
tests/models/test_big_models.py
tests/models/test_big_models.py
+15
-0
tests/models/test_fp8.py
tests/models/test_fp8.py
+90
-0
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+98
-0
tests/models/test_marlin.py
tests/models/test_marlin.py
+13
-32
tests/models/test_models.py
tests/models/test_models.py
+15
-0
tests/models/utils.py
tests/models/utils.py
+29
-0
tests/quantization/test_autogptq_marlin_configs.py
tests/quantization/test_autogptq_marlin_configs.py
+0
-64
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+73
-0
No files found.
tests/entrypoints/openai/test_serving_chat.py
0 → 100644
View file @
1591c68f
import
asyncio
from
dataclasses
import
dataclass
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
MODEL_NAME
=
"openai-community/gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
@
dataclass
class
MockModelConfig
:
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
tokenizer_mode
=
"auto"
max_model_len
=
100
tokenizer_revision
=
None
@
dataclass
class
MockEngine
:
async
def
get_model_config
(
self
):
return
MockModelConfig
async
def
_async_serving_chat_init
():
serving_completion
=
OpenAIServingChat
(
MockEngine
(),
served_model_names
=
[
MODEL_NAME
],
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
)
return
serving_completion
def
test_async_serving_chat_init
():
serving_completion
=
asyncio
.
run
(
_async_serving_chat_init
())
assert
serving_completion
.
tokenizer
is
not
None
assert
serving_completion
.
tokenizer
.
chat_template
==
CHAT_TEMPLATE
tests/entrypoints/test_guided_processors.py
View file @
1591c68f
...
@@ -57,7 +57,9 @@ def test_guided_logits_processors():
...
@@ -57,7 +57,9 @@ def test_guided_logits_processors():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
regex_LP
=
RegexLogitsProcessor
(
TEST_REGEX
,
tokenizer
)
regex_LP
=
RegexLogitsProcessor
(
TEST_REGEX
,
tokenizer
)
json_LP
=
JSONLogitsProcessor
(
TEST_SCHEMA
,
tokenizer
)
json_LP
=
JSONLogitsProcessor
(
TEST_SCHEMA
,
tokenizer
,
whitespace_pattern
=
None
)
regex_LP
.
init_state
()
regex_LP
.
init_state
()
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
...
...
tests/entrypoints/test_openai_server.py
View file @
1591c68f
...
@@ -13,8 +13,10 @@ import pytest
...
@@ -13,8 +13,10 @@ import pytest
# and debugging.
# and debugging.
import
ray
import
ray
import
requests
import
requests
import
torch
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -148,7 +150,7 @@ def server(zephyr_lora_files):
...
@@ -148,7 +150,7 @@ def server(zephyr_lora_files):
ray
.
shutdown
()
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"
session
"
)
@
pytest
.
fixture
(
scope
=
"
module
"
)
def
client
():
def
client
():
client
=
openai
.
AsyncOpenAI
(
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
base_url
=
"http://localhost:8000/v1"
,
...
@@ -770,6 +772,40 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
...
@@ -770,6 +772,40 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
assert
loaded
==
{
"result"
:
2
},
loaded
assert
loaded
==
{
"result"
:
2
},
loaded
async
def
test_extra_fields
(
server
,
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
,
"extra_field"
:
"0"
,
}],
# type: ignore
temperature
=
0
,
seed
=
0
)
assert
"extra_forbidden"
in
exc_info
.
value
.
message
async
def
test_complex_message_content
(
server
,
client
:
openai
.
AsyncOpenAI
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
"what is 1+1? please provide the result without any other text."
}]
}],
temperature
=
0
,
seed
=
0
)
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
==
"2"
async
def
test_guided_grammar
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_guided_grammar
(
server
,
client
:
openai
.
AsyncOpenAI
):
simple_sql_grammar
=
"""
simple_sql_grammar
=
"""
start: select_statement
start: select_statement
...
@@ -835,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
...
@@ -835,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
assert
len
(
logprobs
.
tokens
)
>
5
assert
len
(
logprobs
.
tokens
)
>
5
async
def
test_long_seed
(
server
,
client
:
openai
.
AsyncOpenAI
):
for
seed
in
[
torch
.
iinfo
(
torch
.
long
).
min
-
1
,
torch
.
iinfo
(
torch
.
long
).
max
+
1
]:
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
,
}],
temperature
=
0
,
seed
=
seed
)
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
or
"less_than_equal"
in
exc_info
.
value
.
message
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
pytest
.
main
([
__file__
])
tests/kernels/conftest.py
View file @
1591c68f
import
pytest
import
pytest
from
vllm.utils
import
create_kv_caches_with_random
from
vllm.utils
import
(
create_kv_caches_with_random
,
create_kv_caches_with_random_flash
)
@
pytest
.
fixture
()
@
pytest
.
fixture
()
def
kv_cache_factory
():
def
kv_cache_factory
():
return
create_kv_caches_with_random
return
create_kv_caches_with_random
@
pytest
.
fixture
()
def
kv_cache_factory_flashinfer
():
return
create_kv_caches_with_random_flash
tests/kernels/test_attention.py
View file @
1591c68f
...
@@ -66,7 +66,7 @@ def ref_single_query_cached_kv_attention(
...
@@ -66,7 +66,7 @@ def ref_single_query_cached_kv_attention(
key_cache
:
torch
.
Tensor
,
key_cache
:
torch
.
Tensor
,
value_cache
:
torch
.
Tensor
,
value_cache
:
torch
.
Tensor
,
block_tables
:
torch
.
Tensor
,
block_tables
:
torch
.
Tensor
,
context
_lens
:
torch
.
Tensor
,
seq
_lens
:
torch
.
Tensor
,
scale
:
float
,
scale
:
float
,
alibi_slopes
:
Optional
[
torch
.
Tensor
],
alibi_slopes
:
Optional
[
torch
.
Tensor
],
)
->
None
:
)
->
None
:
...
@@ -77,15 +77,15 @@ def ref_single_query_cached_kv_attention(
...
@@ -77,15 +77,15 @@ def ref_single_query_cached_kv_attention(
num_seqs
=
query
.
shape
[
0
]
num_seqs
=
query
.
shape
[
0
]
block_tables
=
block_tables
.
cpu
().
tolist
()
block_tables
=
block_tables
.
cpu
().
tolist
()
context
_lens
=
context
_lens
.
cpu
().
tolist
()
seq
_lens
=
seq
_lens
.
cpu
().
tolist
()
for
i
in
range
(
num_seqs
):
for
i
in
range
(
num_seqs
):
q
=
query
[
i
].
unsqueeze
(
0
)
q
=
query
[
i
].
unsqueeze
(
0
)
block_table
=
block_tables
[
i
]
block_table
=
block_tables
[
i
]
context
_len
=
int
(
context
_lens
[
i
])
seq
_len
=
int
(
seq
_lens
[
i
])
keys
=
[]
keys
=
[]
values
=
[]
values
=
[]
for
j
in
range
(
context
_len
):
for
j
in
range
(
seq
_len
):
block_number
=
int
(
block_table
[
j
//
block_size
])
block_number
=
int
(
block_table
[
j
//
block_size
])
block_offset
=
j
%
block_size
block_offset
=
j
%
block_size
...
@@ -105,8 +105,8 @@ def ref_single_query_cached_kv_attention(
...
@@ -105,8 +105,8 @@ def ref_single_query_cached_kv_attention(
alibi_bias
=
None
alibi_bias
=
None
if
alibi_slopes
is
not
None
:
if
alibi_slopes
is
not
None
:
# Create the ALiBi bias used in the paged attention kernel.
# Create the ALiBi bias used in the paged attention kernel.
position_ids
=
torch
.
arange
(
context
_len
).
int
()
position_ids
=
torch
.
arange
(
seq
_len
).
int
()
alibi_bias
=
(
position_ids
-
context
_len
+
1
).
float
()
alibi_bias
=
(
position_ids
-
seq
_len
+
1
).
float
()
alibi_bias
=
alibi_slopes
.
view
(
-
1
,
1
,
1
)
*
alibi_bias
.
view
(
alibi_bias
=
alibi_slopes
.
view
(
-
1
,
1
,
1
)
*
alibi_bias
.
view
(
1
,
1
,
-
1
)
1
,
1
,
-
1
)
...
@@ -154,13 +154,13 @@ def test_paged_attention(
...
@@ -154,13 +154,13 @@ def test_paged_attention(
if
use_alibi
:
if
use_alibi
:
alibi_slopes
=
torch
.
randn
(
num_query_heads
,
dtype
=
torch
.
float
)
alibi_slopes
=
torch
.
randn
(
num_query_heads
,
dtype
=
torch
.
float
)
context
_lens
=
[
random
.
randint
(
1
,
MAX_SEQ_LEN
)
for
_
in
range
(
num_seqs
)]
seq
_lens
=
[
random
.
randint
(
1
,
MAX_SEQ_LEN
)
for
_
in
range
(
num_seqs
)]
context
_lens
[
-
1
]
=
MAX_SEQ_LEN
seq
_lens
[
-
1
]
=
MAX_SEQ_LEN
max_
context
_len
=
max
(
context
_lens
)
max_
seq
_len
=
max
(
seq
_lens
)
context
_lens
=
torch
.
tensor
(
context
_lens
,
dtype
=
torch
.
int
)
seq
_lens
=
torch
.
tensor
(
seq
_lens
,
dtype
=
torch
.
int
)
# Create the block tables.
# Create the block tables.
max_num_blocks_per_seq
=
(
max_
context
_len
+
block_size
-
1
)
//
block_size
max_num_blocks_per_seq
=
(
max_
seq
_len
+
block_size
-
1
)
//
block_size
block_tables
=
[]
block_tables
=
[]
for
_
in
range
(
num_seqs
):
for
_
in
range
(
num_seqs
):
block_table
=
[
block_table
=
[
...
@@ -191,16 +191,15 @@ def test_paged_attention(
...
@@ -191,16 +191,15 @@ def test_paged_attention(
num_kv_heads
,
num_kv_heads
,
scale
,
scale
,
block_tables
,
block_tables
,
context
_lens
,
seq
_lens
,
block_size
,
block_size
,
max_
context
_len
,
max_
seq
_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
kv_scale
,
)
)
elif
version
==
"v2"
:
elif
version
==
"v2"
:
num_partitions
=
((
max_context_len
+
PARTITION_SIZE
-
1
)
//
num_partitions
=
((
max_seq_len
+
PARTITION_SIZE
-
1
)
//
PARTITION_SIZE
)
PARTITION_SIZE
)
assert
PARTITION_SIZE
%
block_size
==
0
assert
PARTITION_SIZE
%
block_size
==
0
num_seqs
,
num_heads
,
head_size
=
output
.
shape
num_seqs
,
num_heads
,
head_size
=
output
.
shape
tmp_output
=
torch
.
empty
(
tmp_output
=
torch
.
empty
(
...
@@ -223,9 +222,9 @@ def test_paged_attention(
...
@@ -223,9 +222,9 @@ def test_paged_attention(
num_kv_heads
,
num_kv_heads
,
scale
,
scale
,
block_tables
,
block_tables
,
context
_lens
,
seq
_lens
,
block_size
,
block_size
,
max_
context
_len
,
max_
seq
_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
kv_scale
,
...
@@ -260,7 +259,7 @@ def test_paged_attention(
...
@@ -260,7 +259,7 @@ def test_paged_attention(
key_cache
,
key_cache
,
value_cache
,
value_cache
,
block_tables
,
block_tables
,
context
_lens
,
seq
_lens
,
scale
,
scale
,
alibi_slopes
,
alibi_slopes
,
)
)
...
...
tests/kernels/test_cache.py
View file @
1591c68f
...
@@ -5,6 +5,7 @@ import pytest
...
@@ -5,6 +5,7 @@ import pytest
import
torch
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm._C
import
cache_ops
from
vllm.utils
import
is_hip
from
vllm.utils
import
is_hip
COPYING_DIRECTION
=
[(
'cuda'
,
'cpu'
),
(
'cuda'
,
'cuda'
),
(
'cpu'
,
'cuda'
)]
COPYING_DIRECTION
=
[(
'cuda'
,
'cpu'
),
(
'cuda'
,
'cuda'
),
(
'cpu'
,
'cuda'
)]
...
@@ -195,6 +196,82 @@ def test_reshape_and_cache(
...
@@ -195,6 +196,82 @@ def test_reshape_and_cache(
assert
torch
.
allclose
(
value_cache
,
cloned_value_cache
)
assert
torch
.
allclose
(
value_cache
,
cloned_value_cache
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
KV_CACHE_DTYPE
)
@
torch
.
inference_mode
()
def
test_reshape_and_cache_flash
(
kv_cache_factory_flashinfer
,
num_tokens
:
int
,
num_heads
:
int
,
head_size
:
int
,
block_size
:
int
,
num_blocks
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
device
:
str
,
kv_cache_dtype
:
str
,
)
->
None
:
if
kv_cache_dtype
==
"fp8"
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
# Create a random slot mapping.
num_slots
=
block_size
*
num_blocks
slot_mapping
=
random
.
sample
(
range
(
num_slots
),
num_tokens
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
dtype
=
torch
.
long
,
device
=
'cuda'
)
qkv
=
torch
.
randn
(
num_tokens
,
3
,
num_heads
,
head_size
,
dtype
=
dtype
,
device
=
device
)
_
,
key
,
value
=
qkv
.
unbind
(
dim
=
1
)
# Create the KV caches.
key_caches
,
value_caches
=
kv_cache_factory_flashinfer
(
num_blocks
,
block_size
,
1
,
num_heads
,
head_size
,
kv_cache_dtype
,
dtype
,
)
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Clone the KV caches.
cloned_key_cache
=
key_cache
.
clone
()
cloned_value_cache
=
value_cache
.
clone
()
# Call the reshape_and_cache kernel.
cache_ops
.
reshape_and_cache_flash
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
kv_cache_dtype
)
# Run the reference implementation.
block_indicies
=
torch
.
div
(
slot_mapping
,
block_size
,
rounding_mode
=
'floor'
)
block_indicies
=
block_indicies
.
cpu
().
tolist
()
block_offsets
=
slot_mapping
%
block_size
block_offsets
=
block_offsets
.
cpu
().
tolist
()
for
i
in
range
(
num_tokens
):
block_idx
=
block_indicies
[
i
]
block_offset
=
block_offsets
[
i
]
cloned_key_cache
[
block_idx
,
block_offset
,
:,
:]
=
key
[
i
]
cloned_value_cache
[
block_idx
,
block_offset
,
:,
:]
=
value
[
i
]
assert
torch
.
allclose
(
key_cache
,
cloned_key_cache
)
assert
torch
.
allclose
(
value_cache
,
cloned_value_cache
)
@
pytest
.
mark
.
parametrize
(
"direction"
,
COPYING_DIRECTION
)
@
pytest
.
mark
.
parametrize
(
"direction"
,
COPYING_DIRECTION
)
@
pytest
.
mark
.
parametrize
(
"num_mappings"
,
NUM_MAPPINGS
)
@
pytest
.
mark
.
parametrize
(
"num_mappings"
,
NUM_MAPPINGS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
...
...
tests/kernels/test_moe.py
View file @
1591c68f
...
@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
...
@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
for
i
in
range
(
config
.
num_local_experts
):
for
i
in
range
(
config
.
num_local_experts
):
weights
=
(
hf_moe
.
experts
[
i
].
w1
.
weight
.
data
,
weights
=
(
hf_moe
.
experts
[
i
].
w1
.
weight
.
data
,
hf_moe
.
experts
[
i
].
w3
.
weight
.
data
)
hf_moe
.
experts
[
i
].
w3
.
weight
.
data
)
vllm_moe
.
w
s
[
i
][:]
=
torch
.
cat
(
weights
,
dim
=
0
)
vllm_moe
.
w
13_weight
[
i
][:]
=
torch
.
cat
(
weights
,
dim
=
0
)
vllm_moe
.
w2
s
[
i
][:]
=
hf_moe
.
experts
[
i
].
w2
.
weight
.
data
vllm_moe
.
w2
_weight
[
i
][:]
=
hf_moe
.
experts
[
i
].
w2
.
weight
.
data
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
hf_inputs
=
torch
.
randn
((
1
,
64
,
config
.
hidden_size
)).
to
(
dtype
).
to
(
"cuda"
)
hf_inputs
=
torch
.
randn
((
1
,
64
,
config
.
hidden_size
)).
to
(
dtype
).
to
(
"cuda"
)
...
...
tests/kernels/test_prefix_prefill.py
View file @
1591c68f
...
@@ -15,6 +15,7 @@ DTYPES = [torch.float16]
...
@@ -15,6 +15,7 @@ DTYPES = [torch.float16]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
]
SLIDING_WINDOW
=
[
0
,
16
,
64
,
128
,
256
,
512
,
2048
]
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
...
@@ -22,11 +23,13 @@ CUDA_DEVICES = [
...
@@ -22,11 +23,13 @@ CUDA_DEVICES = [
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"sliding_window"
,
SLIDING_WINDOW
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_contexted_kv_attention
(
def
test_contexted_kv_attention
(
num_heads
:
int
,
num_heads
:
int
,
num_queries_per_kv
:
int
,
num_queries_per_kv
:
int
,
head_size
:
int
,
head_size
:
int
,
sliding_window
:
int
,
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
...
@@ -48,12 +51,12 @@ def test_contexted_kv_attention(
...
@@ -48,12 +51,12 @@ def test_contexted_kv_attention(
cache_size
=
640
cache_size
=
640
block_size
=
32
block_size
=
32
max_block_per_request
=
64
max_block_per_request
=
64
sub
query_lens
=
[
random
.
randint
(
16
,
MAX_SEQ_LEN
)
for
_
in
range
(
BS
)]
query_lens
=
[
random
.
randint
(
16
,
MAX_SEQ_LEN
)
for
_
in
range
(
BS
)]
ctx_lens
=
[
random
.
randint
(
16
,
MAX_CTX_LEN
)
for
_
in
range
(
BS
)]
ctx_lens
=
[
random
.
randint
(
16
,
MAX_CTX_LEN
)
for
_
in
range
(
BS
)]
seq_lens
=
[
a
+
b
for
a
,
b
in
zip
(
sub
query_lens
,
ctx_lens
)]
seq_lens
=
[
a
+
b
for
a
,
b
in
zip
(
query_lens
,
ctx_lens
)]
num_kv_heads
=
num_heads
//
num_queries_per_kv
num_kv_heads
=
num_heads
//
num_queries_per_kv
num_tokens
=
sum
(
sub
query_lens
)
num_tokens
=
sum
(
query_lens
)
query
=
torch
.
empty
(
num_tokens
,
num_heads
,
head_size
,
dtype
=
dtype
)
query
=
torch
.
empty
(
num_tokens
,
num_heads
,
head_size
,
dtype
=
dtype
)
query
.
uniform_
(
-
1e-3
,
1e-3
)
query
.
uniform_
(
-
1e-3
,
1e-3
)
output
=
torch
.
empty
(
num_tokens
,
num_heads
,
head_size
,
dtype
=
dtype
)
output
=
torch
.
empty
(
num_tokens
,
num_heads
,
head_size
,
dtype
=
dtype
)
...
@@ -72,15 +75,15 @@ def test_contexted_kv_attention(
...
@@ -72,15 +75,15 @@ def test_contexted_kv_attention(
num_kv_heads
,
num_kv_heads
,
head_size
,
head_size
,
dtype
=
dtype
)
dtype
=
dtype
)
k
=
torch
.
zeros
(
sum
(
sub
query_lens
),
num_kv_heads
,
head_size
,
dtype
=
dtype
)
k
=
torch
.
zeros
(
sum
(
query_lens
),
num_kv_heads
,
head_size
,
dtype
=
dtype
)
v
=
torch
.
zeros
(
sum
(
sub
query_lens
),
num_kv_heads
,
head_size
,
dtype
=
dtype
)
v
=
torch
.
zeros
(
sum
(
query_lens
),
num_kv_heads
,
head_size
,
dtype
=
dtype
)
values
=
torch
.
arange
(
0
,
cache_size
,
dtype
=
torch
.
long
)
values
=
torch
.
arange
(
0
,
cache_size
,
dtype
=
torch
.
long
)
values
=
values
[
torch
.
randperm
(
cache_size
)]
values
=
values
[
torch
.
randperm
(
cache_size
)]
block_table
=
values
[:
BS
*
max_block_per_request
].
view
(
block_table
=
values
[:
BS
*
max_block_per_request
].
view
(
BS
,
max_block_per_request
)
BS
,
max_block_per_request
)
b_seq_len
=
torch
.
tensor
(
seq_lens
,
dtype
=
torch
.
long
)
b_seq_len
=
torch
.
tensor
(
seq_lens
,
dtype
=
torch
.
long
)
b_ctx_len
=
torch
.
tensor
(
ctx_lens
,
dtype
=
torch
.
long
)
b_ctx_len
=
torch
.
tensor
(
ctx_lens
,
dtype
=
torch
.
long
)
b_start_loc
=
torch
.
cumsum
(
torch
.
tensor
([
0
]
+
sub
query_lens
[:
-
1
],
b_start_loc
=
torch
.
cumsum
(
torch
.
tensor
([
0
]
+
query_lens
[:
-
1
],
dtype
=
torch
.
long
),
dtype
=
torch
.
long
),
dim
=
0
)
dim
=
0
)
max_input_len
=
MAX_SEQ_LEN
max_input_len
=
MAX_SEQ_LEN
...
@@ -89,7 +92,7 @@ def test_contexted_kv_attention(
...
@@ -89,7 +92,7 @@ def test_contexted_kv_attention(
dtype
=
torch
.
long
),
dtype
=
torch
.
long
),
dim
=
0
)
dim
=
0
)
for
i
in
range
(
BS
):
for
i
in
range
(
BS
):
for
j
in
range
(
sub
query_lens
[
i
]):
for
j
in
range
(
query_lens
[
i
]):
k
[
b_start_loc
[
i
]
+
j
].
copy_
(
key
[
b_seq_start_loc
[
i
]
+
b_ctx_len
[
i
]
+
k
[
b_start_loc
[
i
]
+
j
].
copy_
(
key
[
b_seq_start_loc
[
i
]
+
b_ctx_len
[
i
]
+
j
])
j
])
v
[
b_start_loc
[
i
]
+
j
].
copy_
(
value
[
b_seq_start_loc
[
i
]
+
v
[
b_start_loc
[
i
]
+
j
].
copy_
(
value
[
b_seq_start_loc
[
i
]
+
...
@@ -123,12 +126,32 @@ def test_contexted_kv_attention(
...
@@ -123,12 +126,32 @@ def test_contexted_kv_attention(
# Warm up the Triton kernel by calling it once before actually measuring
# Warm up the Triton kernel by calling it once before actually measuring
# generation time
# generation time
context_attention_fwd
(
query
,
k
,
v
,
output
,
k_cache
,
v_cache
,
block_table
,
context_attention_fwd
(
query
,
b_start_loc
,
b_seq_len
,
b_ctx_len
,
max_input_len
)
k
,
v
,
output
,
k_cache
,
v_cache
,
block_table
,
b_start_loc
,
b_seq_len
,
b_ctx_len
,
max_input_len
,
sliding_window
=
sliding_window
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
start_time
=
time
.
time
()
start_time
=
time
.
time
()
context_attention_fwd
(
query
,
k
,
v
,
output
,
k_cache
,
v_cache
,
block_table
,
context_attention_fwd
(
query
,
b_start_loc
,
b_seq_len
,
b_ctx_len
,
max_input_len
)
k
,
v
,
output
,
k_cache
,
v_cache
,
block_table
,
b_start_loc
,
b_seq_len
,
b_ctx_len
,
max_input_len
,
sliding_window
=
sliding_window
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
"triton Time:
{
(
end_time
-
start_time
)
*
1000
:.
2
f
}
ms"
)
print
(
f
"triton Time:
{
(
end_time
-
start_time
)
*
1000
:.
2
f
}
ms"
)
...
@@ -155,7 +178,10 @@ def test_contexted_kv_attention(
...
@@ -155,7 +178,10 @@ def test_contexted_kv_attention(
value
=
value
.
unsqueeze
(
0
)
value
=
value
.
unsqueeze
(
0
)
attn_bias
=
BlockDiagonalCausalFromBottomRightMask
.
from_seqlens
(
attn_bias
=
BlockDiagonalCausalFromBottomRightMask
.
from_seqlens
(
subquery_lens
,
seq_lens
)
query_lens
,
seq_lens
)
if
sliding_window
>
0
:
attn_bias
=
attn_bias
.
make_local_attention_from_bottomright
(
sliding_window
)
output_ref
=
xops
.
memory_efficient_attention_forward
(
output_ref
=
xops
.
memory_efficient_attention_forward
(
query
,
query
,
key
,
key
,
...
...
tests/lora/test_layers.py
View file @
1591c68f
...
@@ -8,6 +8,10 @@ import torch
...
@@ -8,6 +8,10 @@ import torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm.config
import
LoRAConfig
from
vllm.config
import
LoRAConfig
from
vllm.lora.fully_sharded_layers
import
(
ColumnParallelLinearWithShardedLoRA
,
MergedColumnParallelLinearWithShardedLoRA
,
MergedQKVParallelLinearWithShardedLora
,
RowParallelLinearWithShardedLoRA
)
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
# yapf: disable
# yapf: disable
from
vllm.lora.layers
import
(
BaseLayerWithLoRA
,
ColumnParallelLinearWithLoRA
,
from
vllm.lora.layers
import
(
BaseLayerWithLoRA
,
ColumnParallelLinearWithLoRA
,
...
@@ -524,13 +528,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
...
@@ -524,13 +528,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"orientation"
,
[
"row"
,
"column"
])
@
pytest
.
mark
.
parametrize
(
"orientation"
,
[
"row"
,
"column"
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
device
)
->
None
:
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
)
->
None
:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
)
lora_dtype
=
torch
.
float16
)
def
create_random_linear_parallel_layer
():
def
create_random_linear_parallel_layer
():
...
@@ -540,14 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
...
@@ -540,14 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
bias
=
False
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
RowParallelLinearWithLoRA
(
linear
)
lora_linear
=
(
RowParallelLinearWithLoRA
(
linear
)
if
not
fully_shard
else
RowParallelLinearWithShardedLoRA
(
linear
))
else
:
else
:
linear
=
ColumnParallelLinear
(
4096
,
linear
=
ColumnParallelLinear
(
4096
,
4096
,
4096
,
bias
=
False
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
ColumnParallelLinearWithLoRA
(
linear
)
lora_linear
=
(
ColumnParallelLinearWithLoRA
(
linear
)
if
not
fully_shard
else
ColumnParallelLinearWithShardedLoRA
(
linear
))
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
return
linear
,
lora_linear
return
linear
,
lora_linear
...
@@ -629,13 +639,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
...
@@ -629,13 +639,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"repeats"
,
[
1
,
2
,
3
])
@
pytest
.
mark
.
parametrize
(
"repeats"
,
[
1
,
2
,
3
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
device
)
->
None
:
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
)
->
None
:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
)
lora_dtype
=
torch
.
float16
)
def
create_column_parallel_packed_layer
():
def
create_column_parallel_packed_layer
():
...
@@ -644,7 +657,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
...
@@ -644,7 +657,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
bias
=
False
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
MergedColumnParallelLinearWithLoRA
(
linear
)
lora_linear
=
(
MergedColumnParallelLinearWithLoRA
(
linear
)
if
not
fully_shard
else
MergedColumnParallelLinearWithShardedLoRA
(
linear
))
elif
repeats
==
3
:
elif
repeats
==
3
:
linear
=
QKVParallelLinear
(
4096
,
linear
=
QKVParallelLinear
(
4096
,
64
,
64
,
...
@@ -652,7 +667,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
...
@@ -652,7 +667,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
bias
=
False
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
MergedQKVParallelLinearWithLora
(
linear
)
lora_linear
=
(
MergedQKVParallelLinearWithLora
(
linear
)
if
not
fully_shard
else
MergedQKVParallelLinearWithShardedLora
(
linear
))
else
:
else
:
linear
=
QKVParallelLinear
(
4096
,
linear
=
QKVParallelLinear
(
4096
,
64
,
64
,
...
...
tests/lora/test_punica.py
View file @
1591c68f
...
@@ -34,11 +34,14 @@ def _lora_ref_impl(
...
@@ -34,11 +34,14 @@ def _lora_ref_impl(
for
i
,
lora_idx
in
zip
(
range
(
bs
),
indicies
.
cpu
().
tolist
()):
for
i
,
lora_idx
in
zip
(
range
(
bs
),
indicies
.
cpu
().
tolist
()):
xi
=
x
[
i
].
unsqueeze
(
0
).
to
(
torch
.
float32
)
xi
=
x
[
i
].
unsqueeze
(
0
).
to
(
torch
.
float32
)
wa
=
wa_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
wa
=
wa_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
wb
=
wb_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
if
wb_T_all
is
not
None
:
wb
=
wb_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
tmp
=
xi
@
wa
tmp
=
xi
@
wa
y_stage_1
[
i
]
=
tmp
.
squeeze
(
0
)
y_stage_1
[
i
]
=
tmp
.
squeeze
(
0
)
y_final
[
i
]
+=
(
tmp
@
wb
).
squeeze
(
0
)
*
s
y_final
[
i
]
+=
((
tmp
@
wb
).
squeeze
(
0
)
*
s
if
wb_T_all
is
not
None
else
y_stage_1
[
i
])
return
y_final
,
y_stage_1
return
y_final
,
y_stage_1
...
@@ -91,12 +94,56 @@ H1 = H2 = [
...
@@ -91,12 +94,56 @@ H1 = H2 = [
128000
,
128000
,
128256
,
128256
,
]
]
H2
=
[
64
]
+
H2
R
=
[
1
,
2
,
4
]
SEED
=
[
0xabcdabcd987
]
SEED
=
[
0xabcdabcd987
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
]
@
pytest
.
mark
.
parametrize
(
"dtype_str"
,
[
"float16"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"h1"
,
H1
)
@
pytest
.
mark
.
parametrize
(
"r"
,
R
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEED
)
@
torch
.
inference_mode
()
def
test_lora_a_extra_shapes
(
dtype_str
,
h1
,
r
,
seed
):
torch
.
manual_seed
(
seed
)
num_loras
=
4
num_layers
=
1
bs
=
32
dtype
=
getattr
(
torch
,
dtype_str
)
device
=
torch
.
device
(
"cuda"
)
wa_T_all
=
torch
.
randn
(
num_loras
,
num_layers
,
r
,
h1
,
dtype
=
dtype
,
device
=
device
)
indices
=
torch
.
randint
(
num_loras
,
(
bs
,
),
dtype
=
torch
.
long
,
device
=
device
)
for
layer_idx
in
range
(
num_layers
):
x
=
torch
.
randn
(
bs
,
h1
,
dtype
=
dtype
,
device
=
device
)
y
=
torch
.
randn
(
bs
,
r
,
dtype
=
dtype
,
device
=
device
)
y_ref
=
y
.
clone
()
_lora_ref_impl
(
y_ref
,
x
,
wa_T_all
,
None
,
indices
,
layer_idx
,
1.0
,
)
y_our
=
y
.
clone
()
punica
.
bgmv
(
y_our
,
x
,
wa_T_all
,
indices
,
layer_idx
,
1.0
)
assert_close
(
y_ref
,
y_our
)
@
pytest
.
mark
.
parametrize
(
"dtype_str"
,
[
"float16"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype_str"
,
[
"float16"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"h1"
,
H1
)
@
pytest
.
mark
.
parametrize
(
"h1"
,
H1
)
@
pytest
.
mark
.
parametrize
(
"h2"
,
H2
)
@
pytest
.
mark
.
parametrize
(
"h2"
,
H2
)
...
...
tests/metrics/test_metrics.py
View file @
1591c68f
from
typing
import
List
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.sampling_params
import
SamplingParams
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
...
@@ -68,3 +76,119 @@ def test_metric_counter_generation_tokens(
...
@@ -68,3 +76,119 @@ def test_metric_counter_generation_tokens(
assert
vllm_generation_count
==
metric_count
,
(
assert
vllm_generation_count
==
metric_count
,
(
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
f
"metric:
{
metric_count
!
r
}
"
)
f
"metric:
{
metric_count
!
r
}
"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                   served_model_name: List[str]) -> None:
    """Check the ``model_name`` label carried by the stat logger.

    When no served model name is given (``None`` or an empty list) the label
    is expected to equal ``model``; otherwise it is expected to equal the
    first entry of ``served_model_name``.
    """
    runner = vllm_runner(model,
                         dtype=dtype,
                         disable_log_stats=False,
                         gpu_memory_utilization=0.3,
                         served_model_name=served_model_name)
    metrics_tag_content = (
        runner.model.llm_engine.stat_logger.labels["model_name"])
    # Drop the runner before asserting, as the original test did.
    del runner

    if served_model_name is None or served_model_name == []:
        expected_tag = model
    else:
        expected_tag = served_model_name[0]

    assert metrics_tag_content == expected_tag, (
        f"Metrics tag model_name is wrong! expect: "
        f"{expected_tag!r}\n"
        f"actual: {metrics_tag_content!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
@pytest.mark.asyncio
async def test_async_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
    """
    Regression test ensuring async engine generates metrics
    when disable_log_stats=False
    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
    """
    async_engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model=model,
                        dtype=dtype,
                        disable_log_stats=disable_log_stats))

    for request_no, prompt in enumerate(example_prompts):
        stream = async_engine.generate(
            prompt,
            SamplingParams(max_tokens=max_tokens),
            f"request-id-{request_no}",
        )
        # The engine only makes progress while the stream is consumed, so
        # drain it completely before submitting the next prompt.
        async for _ in stream:
            pass

    assert_metrics(async_engine.engine, disable_log_stats,
                   len(example_prompts))
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
def test_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
    """Synchronous counterpart of the async metrics regression test.

    Queues every example prompt on an ``LLMEngine``, steps the engine until
    no request is unfinished, then delegates the checks to ``assert_metrics``.
    """
    engine = LLMEngine.from_engine_args(
        EngineArgs(model=model,
                   dtype=dtype,
                   disable_log_stats=disable_log_stats))

    for request_no, prompt in enumerate(example_prompts):
        request_id = f"request-id-{request_no}"
        sampling = SamplingParams(max_tokens=max_tokens)
        engine.add_request(request_id, prompt, sampling)

    # Drive the engine until every queued request has completed.
    while engine.has_unfinished_requests():
        engine.step()

    assert_metrics(engine, disable_log_stats, len(example_prompts))
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
    """Assert the engine's stat-logging state matches ``disable_log_stats``.

    With stats disabled, reading ``engine.stat_logger`` must raise
    ``AttributeError``. With stats enabled, the logger must be set and the
    ``_count`` sample of each request-level histogram must equal
    ``num_requests``.
    """
    if disable_log_stats:
        with pytest.raises(AttributeError):
            _ = engine.stat_logger
        return

    assert (engine.stat_logger
            is not None), "engine.stat_logger should be set"

    # Ensure the count bucket of request-level histogram metrics matches
    # the number of requests as a simple sanity check to ensure metrics are
    # generated
    labels = {'model_name': engine.model_config.model}
    histogram_names = (
        "vllm:e2e_request_latency_seconds",
        "vllm:request_prompt_tokens",
        "vllm:request_generation_tokens",
        "vllm:request_params_best_of",
        "vllm:request_params_n",
    )
    for name in histogram_names:
        observed = REGISTRY.get_sample_value(f"{name}_count", labels)
        assert (observed == num_requests), "Metrics should be collected"
tests/model_executor/weight_utils.py
View file @
1591c68f
import
os
import
os
import
tempfile
import
huggingface_hub.constants
import
huggingface_hub.constants
import
pytest
import
pytest
from
huggingface_hub.utils
import
LocalEntryNotFoundError
from
vllm.model_executor.model_loader.weight_utils
import
enable_hf_transfer
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
enable_hf_transfer
)
def
test_hf_transfer_auto_activation
():
def
test_hf_transfer_auto_activation
():
...
@@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation():
...
@@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation():
HF_TRANFER_ACTIVE
)
HF_TRANFER_ACTIVE
)
def test_download_weights_from_hf():
    """Exercise ``download_weights_from_hf`` in offline and online modes.

    Steps, all against a fresh temporary cache directory:
      1. offline with an empty cache -> ``LocalEntryNotFoundError``;
      2. online -> the download is performed;
      3. offline again -> the call returns a non-``None`` value.

    ``huggingface_hub.constants.HF_HUB_OFFLINE`` is a module-level flag, so
    its previous value is saved and restored in ``finally``; otherwise the
    flag would stay ``True`` after this test and affect any later test that
    runs in the same process.
    """
    previous_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    patterns = ["*.safetensors", "*.bin"]
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # assert LocalEntryNotFoundError error is thrown
            # if offline is set and model is not cached
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            with pytest.raises(LocalEntryNotFoundError):
                download_weights_from_hf("facebook/opt-125m",
                                         allow_patterns=patterns,
                                         cache_dir=tmpdir)

            # download the model
            huggingface_hub.constants.HF_HUB_OFFLINE = False
            download_weights_from_hf("facebook/opt-125m",
                                     allow_patterns=patterns,
                                     cache_dir=tmpdir)

            # now it should work offline
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            assert download_weights_from_hf(
                "facebook/opt-125m",
                allow_patterns=patterns,
                cache_dir=tmpdir) is not None
    finally:
        # Undo the global mutation regardless of how the test ended.
        huggingface_hub.constants.HF_HUB_OFFLINE = previous_offline
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_hf_transfer_auto_activation
()
test_hf_transfer_auto_activation
()
test_download_weights_from_hf
()
tests/models/test_big_models.py
View file @
1591c68f
...
@@ -43,3 +43,18 @@ def test_models(
...
@@ -43,3 +43,18 @@ def test_models(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Smoke test: printing the loaded model must not raise."""
    runner = vllm_runner(model, dtype=dtype)
    # This test is for verifying whether the model's extra_repr
    # can be printed correctly.
    model_runner = (
        runner.model.llm_engine.model_executor.driver_worker.model_runner)
    print(model_runner.model)
    # Release both references so nothing outlives the original `del` point.
    del model_runner
    del runner
tests/models/test_fp8.py
0 → 100644
View file @
1591c68f
# flake8: noqa
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import
os
import
pytest
import
torch
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
MAX_MODEL_LEN
=
1024
MODELS
=
[
"nm-testing/Meta-Llama-3-8B-Instruct-FP8"
,
"meta-llama/Meta-Llama-3-8B-Instruct"
,
]
EXPECTED_STRS_MAP
=
{
"nm-testing/Meta-Llama-3-8B-Instruct-FP8"
:
[
'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models ('
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.'
,
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne'
,
'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya tori, nemuri nemuri)
\n\n
**'
],
"meta-llama/Meta-Llama-3-8B-Instruct"
:
[
'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained'
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.'
,
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne'
,
'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya aki wa mushi o tsukamu'
],
}
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
@pytest.mark.skipif(fp8_not_supported,
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
    example_prompts,
    model_name,
) -> None:
    """Compare fp8 generations with the recorded expected strings.

    Each example prompt is wrapped with the tokenizer's chat template,
    generated with ``temperature=0`` and ``max_tokens=20``, and the text is
    compared for exact equality with ``EXPECTED_STRS_MAP[model_name]``.
    """
    model = LLM(model=model_name,
                max_model_len=MAX_MODEL_LEN,
                enforce_eager=True,
                quantization="fp8")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = []
    for raw_prompt in example_prompts:
        chat = [{"role": "user", "content": raw_prompt}]
        formatted_prompts.append(
            tokenizer.apply_chat_template(chat,
                                          tokenize=False,
                                          add_generation_prompt=True))

    params = SamplingParams(max_tokens=20, temperature=0)
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations = [
        model.generate(chat_prompt, params)[0].outputs[0].text
        for chat_prompt in formatted_prompts
    ]
    del model

    print(generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    # One generation exists per example prompt, so enumerating the
    # generations visits the same indices as range(len(example_prompts)).
    for i, generated_str in enumerate(generations):
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
tests/models/test_gptq_marlin.py
0 → 100644
View file @
1591c68f
"""Compares the outputs of gptq vs gptq_marlin
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Note: This test currently fails running with --forked with the following:
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
To use CUDA with multiprocessing, you must use the 'spawn' start method
Run `pytest tests/models/test_gptq_marlin.py`.
"""
import
os
import
pytest
import
torch
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
MAX_MODEL_LEN
=
1024
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
# act_order==False, group_size=channelwise
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
# act_order==False, group_size=128
(
"TheBloke/Llama-2-7B-GPTQ"
,
"main"
),
# act_order==True, group_size=128
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"main"
),
# act_order==True, group_size=64
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-4bit-64g-actorder_True"
),
# act_order==True, group_size=32
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-4bit-32g-actorder_True"
),
# 8-bit, act_order==True, group_size=channelwise
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit--1g-actorder_True"
),
# 8-bit, act_order==True, group_size=128
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit-128g-actorder_True"
),
# 8-bit, act_order==True, group_size=32
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit-32g-actorder_True"
),
]
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(gptq_marlin_not_supported,
                    reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprobs of one checkpoint under two quantizations.

    ``model`` is a ``(model_name, revision)`` pair. The checkpoint is loaded
    first with ``quantization="marlin"`` and then with
    ``quantization="gptq"``; the two sets of outputs are handed to
    ``check_logprobs_close``.
    """
    model_name, revision = model

    def _greedy_logprobs(quantization: str):
        # Load, generate, and drop the runner before returning so that only
        # one model is held at a time.
        runner = vllm_runner(model_name=model_name,
                             revision=revision,
                             dtype=dtype,
                             quantization=quantization,
                             max_model_len=MAX_MODEL_LEN,
                             tensor_parallel_size=1)
        outputs = runner.generate_greedy_logprobs(example_prompts, max_tokens,
                                                  num_logprobs)
        del runner
        return outputs

    # Run marlin.
    gptq_marlin_outputs = _greedy_logprobs("marlin")

    # Run gptq.
    gptq_outputs = _greedy_logprobs("gptq")

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )
tests/models/test_marlin.py
View file @
1591c68f
...
@@ -10,12 +10,12 @@ up to 3 times to see if we pass.
...
@@ -10,12 +10,12 @@ up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py`.
Run `pytest tests/models/test_marlin.py`.
"""
"""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
import
pytest
import
pytest
import
torch
import
torch
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
torch
.
cuda
.
get_device_capability
()
...
@@ -55,43 +55,24 @@ def test_models(
...
@@ -55,43 +55,24 @@ def test_models(
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
marlin_model
=
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
)
marlin_model
=
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"marlin"
)
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model
# frees the memory.
del
marlin_model
del
marlin_model
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
)
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
max_tokens
,
num_logprobs
)
num_logprobs
)
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting just the model
# frees the memory.
del
gptq_model
del
gptq_model
# loop through the prompts
check_logprobs_close
(
for
prompt_idx
in
range
(
len
(
example_prompts
)):
outputs_0_lst
=
gptq_outputs
,
gptq_output_ids
,
gptq_output_str
,
gptq_logprobs
=
gptq_outputs
[
outputs_1_lst
=
marlin_outputs
,
prompt_idx
]
name_0
=
"gptq"
,
marlin_output_ids
,
marlin_output_str
,
marlin_logprobs
=
marlin_outputs
[
name_1
=
"marlin"
,
prompt_idx
]
)
for
idx
,
(
gptq_output_id
,
marlin_output_id
)
in
enumerate
(
zip
(
gptq_output_ids
,
marlin_output_ids
)):
# If sequence is not an exact match,
if
marlin_output_id
!=
gptq_output_id
:
# Each predicted token must be in top 5 of the other's
assert
gptq_output_id
in
marlin_logprobs
[
idx
],
(
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
"
f
"Marlin:
\t
{
marlin_output_str
!
r
}
"
)
assert
marlin_output_id
in
gptq_logprobs
[
idx
],
(
f
"Test
{
prompt_idx
}
:
\n
GPTQ:
\t
{
gptq_output_str
!
r
}
\n
"
f
"Marlin:
\t
{
marlin_output_str
!
r
}
"
)
# Break out since sequences will now diverge.
break
tests/models/test_models.py
View file @
1591c68f
...
@@ -49,3 +49,18 @@ def test_models(
...
@@ -49,3 +49,18 @@ def test_models(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Smoke test: printing the loaded model must not raise."""
    runner = vllm_runner(model, dtype=dtype)
    # This test is for verifying whether the model's extra_repr
    # can be printed correctly.
    model_runner = (
        runner.model.llm_engine.model_executor.driver_worker.model_runner)
    print(model_runner.model)
    # Release both references so nothing outlives the original `del` point.
    del model_runner
    del runner
tests/models/utils.py
0 → 100644
View file @
1591c68f
def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
    """Compare the logprobs of two sequences generated by different models,
    which should be similar but not necessarily equal.

    Args:
        outputs_0_lst: per-prompt outputs of the first model; each item
            unpacks as ``(output_ids, output_str, logprobs)`` where
            ``logprobs[idx]`` supports ``in`` membership tests on token ids.
        outputs_1_lst: per-prompt outputs of the second model, same layout.
        name_0: label for the first model in failure messages.
        name_1: label for the second model in failure messages.

    Raises:
        AssertionError: if the two lists hold a different number of prompt
            outputs, or if at the first differing token position either
            model's token is absent from the other's logprobs.
    """
    outputs_0_lst = list(outputs_0_lst)
    outputs_1_lst = list(outputs_1_lst)
    # zip() stops at the shorter input, so without this check a model that
    # returned fewer outputs would have its missing prompts skipped silently.
    assert len(outputs_0_lst) == len(outputs_1_lst), (
        f"{name_0} produced {len(outputs_0_lst)} outputs but "
        f"{name_1} produced {len(outputs_1_lst)}")

    # Loop through responses to each prompt.
    for prompt_idx, (outputs_0,
                     outputs_1) in enumerate(zip(outputs_0_lst,
                                                 outputs_1_lst)):
        output_ids_0, output_str_0, logprobs_0 = outputs_0
        output_ids_1, output_str_1, logprobs_1 = outputs_1

        # Loop through generated tokens.
        for idx, (output_id_0,
                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
            if output_id_0 == output_id_1:
                continue

            # First divergence: each predicted token must be in the top N
            # logprobs of the other model.
            failure_msg = (f"Test{prompt_idx}:"
                           f"\n{name_0}:\t{output_str_0!r}"
                           f"\n{name_1}:\t{output_str_1!r}")
            assert output_id_0 in logprobs_1[idx], failure_msg
            assert output_id_1 in logprobs_0[idx], failure_msg

            # Break out since sequences will now diverge.
            break
tests/quantization/test_autogptq_marlin_configs.py
deleted
100644 → 0
View file @
09bcf00b
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
"""
from
dataclasses
import
dataclass
import
pytest
from
vllm.config
import
ModelConfig
@
dataclass
class
ModelPair
:
model_marlin
:
str
model_gptq
:
str
# Model Id // Expected Kernel
MODELS_QUANT_TYPE
=
[
# compat: autogptq <=0.7.1 is_marlin_format: bool
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"gptq"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"gptq"
)
]
@
pytest
.
mark
.
parametrize
(
"model_quant_type"
,
MODELS_QUANT_TYPE
)
def
test_auto_gptq
(
model_quant_type
:
str
,
)
->
None
:
model_path
,
quant_type
=
model_quant_type
model_config_no_quant_arg
=
ModelConfig
(
model_path
,
model_path
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"float16"
,
revision
=
None
,
quantization
=
None
# case 1
)
model_config_quant_arg
=
ModelConfig
(
model_path
,
model_path
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"float16"
,
revision
=
None
,
quantization
=
"gptq"
# case 2
)
assert
model_config_no_quant_arg
.
quantization
==
quant_type
,
(
f
"Expected quant_type ==
{
quant_type
}
for
{
model_path
}
, "
f
"but found
{
model_config_no_quant_arg
.
quantization
}
"
"for no --quantization None case"
)
assert
model_config_quant_arg
.
quantization
==
quant_type
,
(
f
"Expected quant_type ==
{
quant_type
}
for
{
model_path
}
, "
f
"but found
{
model_config_quant_arg
.
quantization
}
"
"for --quantization gptq case"
)
tests/quantization/test_configs.py
0 → 100644
View file @
1591c68f
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_configs.py --forked`.
"""
from
dataclasses
import
dataclass
import
pytest
from
vllm.config
import
ModelConfig
@dataclass
class ModelPair:
    """Pair of model identifiers, one Marlin and one GPTQ.

    NOTE(review): nothing in the visible part of this file references this
    class; confirm whether it is still needed.
    """
    # Presumably the id of the Marlin-serialized checkpoint -- TODO confirm.
    model_marlin: str
    # Presumably the id of the matching GPTQ checkpoint -- TODO confirm.
    model_gptq: str
# Model Id // Quantization Arg // Expected Type
MODEL_ARG_EXPTYPES
=
[
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Marlin Format should always use Marlin kernel.
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
None
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"marlin"
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"gptq"
,
"marlin"
),
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"awq"
,
"ERROR"
),
# Model Serialized in Exllama Format.
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
None
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"marlin"
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"gptq"
,
"gptq"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"awq"
,
"ERROR"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Marlin Format should always use Marlin kernel.
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
None
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"marlin"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"gptq"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"awq"
,
"ERROR"
),
# Model Serialized in Exllama Format.
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
None
,
"gptq_marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"marlin"
,
"gptq_marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"gptq"
,
"gptq"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"awq"
,
"ERROR"
),
# AUTOAWQ
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
None
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"awq"
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"marlin"
,
"ERROR"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"gptq"
,
"ERROR"
),
]
@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: tuple) -> None:
    """Check which quantization type ``ModelConfig`` resolves to.

    Args:
        model_arg_exptype: a ``(model_path, quantization_arg, expected_type)``
            triple from ``MODEL_ARG_EXPTYPES``. ``quantization_arg`` may be
            ``None``. ``expected_type`` is the expected value of
            ``ModelConfig.quantization``, or the sentinel ``"ERROR"`` when
            constructing the config is expected to raise ``ValueError``.
    """
    model_path, quantization_arg, expected_type = model_arg_exptype

    # Only the constructor is expected to raise, so it is the only
    # statement inside the try body.
    try:
        model_config = ModelConfig(model_path,
                                   model_path,
                                   tokenizer_mode="auto",
                                   trust_remote_code=False,
                                   seed=0,
                                   dtype="float16",
                                   revision=None,
                                   quantization=quantization_arg)
    except ValueError:
        found_quantization_type = "ERROR"
    else:
        found_quantization_type = model_config.quantization

    assert found_quantization_type == expected_type, (
        f"Expected quant_type == {expected_type} for {model_path}, "
        f"but found {found_quantization_type} "
        f"for --quantization {quantization_arg} case")
Prev
1
2
3
4
5
6
7
8
9
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment