Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
500b93c8
Commit
500b93c8
authored
Jul 25, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1
parents
99426767
38c4b7e8
Changes
282
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
861 additions
and
340 deletions
+861
-340
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+45
-122
tests/engine/output_processor/test_stop_checker.py
tests/engine/output_processor/test_stop_checker.py
+2
-2
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+91
-0
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+61
-0
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+24
-29
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+115
-95
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+11
-11
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+21
-21
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+5
-3
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+152
-0
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+15
-17
tests/kernels/quant_utils.py
tests/kernels/quant_utils.py
+72
-0
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+5
-3
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+5
-3
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+2
-2
tests/kernels/test_fp8_quant.py
tests/kernels/test_fp8_quant.py
+87
-0
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+10
-16
tests/kernels/test_marlin_gemm.py
tests/kernels/test_marlin_gemm.py
+129
-13
tests/lora/conftest.py
tests/lora/conftest.py
+8
-2
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+1
-1
No files found.
tests/distributed/test_pipeline_parallel.py
View file @
500b93c8
import
os
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..utils
import
compare_two_settings
# downloading lora to test lora requests
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
# any model with a chat template should work here
MODEL_NAME
=
"meta-llama/Meta-Llama-3-8B"
EAGER_MODE
=
bool
(
int
(
os
.
getenv
(
"EAGER_MODE"
,
0
)))
CHUNKED_PREFILL
=
bool
(
int
(
os
.
getenv
(
"CHUNKED_PREFILL"
,
0
)))
TP_SIZE
=
int
(
os
.
getenv
(
"TP_SIZE"
,
1
))
PP_SIZE
=
int
(
os
.
getenv
(
"PP_SIZE"
,
1
))
pytestmark
=
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND"
,
[
@
pytest
.
fixture
(
scope
=
"module"
)
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
def
server
():
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
args
=
[
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
"--model"
,
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
MODEL_NAME
,
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
])
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
,
DIST_BACKEND
):
if
VLLM_MULTI_NODE
and
DIST_BACKEND
==
"mp"
:
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
pp_args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"
b
float16"
,
"float16"
,
"--pipeline-parallel-size"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
str
(
PP_SIZE
),
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
str
(
TP_SIZE
),
str
(
TP_SIZE
),
"--distributed-executor-backend"
,
"--distributed-executor-backend"
,
"ray"
,
DIST_BACKEND
,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--tensor-parallel-size"
,
str
(
max
(
TP_SIZE
,
2
)),
# We only use 2 GPUs in the CI.
"--distributed-executor-backend"
,
"mp"
,
]
]
if
CHUNKED_PREFILL
:
if
CHUNKED_PREFILL
:
args
+=
[
pp_args
.
append
(
"--enable-chunked-prefill"
)
"--enable-chunked-prefill"
,
tp_args
.
append
(
"--enable-chunked-prefill"
)
]
if
EAGER_MODE
:
if
EAGER_MODE
:
args
+=
[
pp_args
.
append
(
"--enforce-eager"
)
"--enforce-eager"
,
tp_args
.
append
(
"--enforce-eager"
)
]
with
RemoteOpenAIServer
(
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
batch
.
choices
)
==
2
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
1
].
text
# test n = 2
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
n
=
2
,
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
assert
batch
.
choices
[
0
].
text
!=
batch
.
choices
[
1
].
text
,
"beam search should be different"
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
2
].
text
,
"two copies of the same prompt should be the same"
assert
batch
.
choices
[
1
].
text
==
batch
.
choices
[
3
].
text
,
"two copies of the same prompt should be the same"
# test streaming
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
)
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
)
texts
=
[
""
]
*
2
async
for
chunk
in
batch
:
assert
len
(
chunk
.
choices
)
==
1
choice
=
chunk
.
choices
[
0
]
texts
[
choice
.
index
]
+=
choice
.
text
assert
texts
[
0
]
==
texts
[
1
]
tests/engine/output_processor/test_stop_checker.py
View file @
500b93c8
...
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
...
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
@
pytest
.
mark
.
parametrize
([
"text_wo_eos"
,
"eos_token"
,
"eos_token_id"
],
[
@
pytest
.
mark
.
parametrize
([
"text_wo_eos"
,
"eos_token"
,
"eos_token_id"
],
[
(
"This text ends with EOS token"
,
"</s>"
,
2
),
(
"This text ends with EOS token"
,
"</s>"
,
2
),
])
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
])
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_on_eos_token
(
text_wo_eos
:
str
,
eos_token
:
str
,
eos_token_id
:
int
,
def
test_stop_on_eos_token
(
text_wo_eos
:
str
,
eos_token
:
str
,
eos_token_id
:
int
,
ignore_eos
:
bool
,
include_stop_str_in_output
:
bool
):
ignore_eos
:
bool
,
include_stop_str_in_output
:
bool
):
...
...
tests/engine/test_custom_executor.py
0 → 100644
View file @
500b93c8
import
asyncio
import
os
import
pytest
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.executor.gpu_executor
import
GPUExecutor
,
GPUExecutorAsync
from
vllm.sampling_params
import
SamplingParams
class Mock:
    """Empty placeholder class.

    Passed as ``distributed_executor_backend`` by the type-checking test in
    this file, which expects the engines to reject it.
    """

    pass
class CustomGPUExecutor(GPUExecutor):
    """GPUExecutor subclass that leaves a ``.marker`` file when it runs."""

    def execute_model(self, *args, **kwargs):
        """Create ``.marker`` in the current directory, then delegate.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``execute_model`` and its result is returned as-is.
        """
        # Touch the marker file so the test can tell this override executed.
        with open(".marker", "w") as _marker_file:
            pass

        result = super().execute_model(*args, **kwargs)
        return result
class CustomGPUExecutorAsync(GPUExecutorAsync):
    """GPUExecutorAsync subclass that leaves a ``.marker`` file when it runs."""

    async def execute_model_async(self, *args, **kwargs):
        """Create ``.marker`` in the current directory, then delegate.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``execute_model_async``; its awaited result is returned as-is.
        """
        # Touch the marker file so the test can tell this override executed.
        with open(".marker", "w") as _marker_file:
            pass

        result = await super().execute_model_async(*args, **kwargs)
        return result
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_type_checking(model):
    """Invalid executor backends must be rejected when building an engine.

    Each case lists the expected exception, the engine-args class, the engine
    class, and the backend handed to ``distributed_executor_backend``.
    """
    rejected_backends = (
        # An unrelated class is refused by both engine flavours.
        (ValueError, EngineArgs, LLMEngine, Mock),
        (ValueError, AsyncEngineArgs, AsyncLLMEngine, Mock),
        # The synchronous custom executor is refused by the async engine.
        (TypeError, AsyncEngineArgs, AsyncLLMEngine, CustomGPUExecutor),
    )

    for expected_error, args_cls, engine_cls, backend in rejected_backends:
        with pytest.raises(expected_error):
            bad_args = args_cls(model=model,
                                distributed_executor_backend=backend)
            engine_cls.from_engine_args(bad_args)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir):
    """A custom executor class given to ``EngineArgs`` is actually used.

    Runs inside ``tmpdir`` so the ``.marker`` file written by
    ``CustomGPUExecutor.execute_model`` lands in a clean directory; the
    original working directory is restored afterwards.
    """
    original_dir = os.path.abspath(".")
    os.chdir(tmpdir)
    try:
        # Nothing has run yet, so the marker must be absent.
        assert not os.path.exists(".marker")

        llm_engine = LLMEngine.from_engine_args(
            EngineArgs(model=model,
                       distributed_executor_backend=CustomGPUExecutor))
        one_token = SamplingParams(max_tokens=1)

        llm_engine.add_request("0", "foo", one_token)
        llm_engine.step()

        # One engine step should have gone through the custom executor.
        assert os.path.exists(".marker")
    finally:
        os.chdir(original_dir)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir):
    """A custom async executor given to ``AsyncEngineArgs`` is actually used.

    Runs inside ``tmpdir`` so the ``.marker`` file written by
    ``CustomGPUExecutorAsync.execute_model_async`` lands in a clean
    directory; the original working directory is restored afterwards.
    """
    original_dir = os.path.abspath(".")
    os.chdir(tmpdir)
    try:
        # Nothing has run yet, so the marker must be absent.
        assert not os.path.exists(".marker")

        async_engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(
                model=model,
                distributed_executor_backend=CustomGPUExecutorAsync))
        one_token = SamplingParams(max_tokens=1)

        async def drain_request():
            # Submit a single request and consume its output stream fully.
            outputs = await async_engine.add_request("0", "foo", one_token)
            async for _ in outputs:
                pass

        asyncio.run(drain_request())

        # Serving the request should have gone through the custom executor.
        assert os.path.exists(".marker")
    finally:
        os.chdir(original_dir)
tests/entrypoints/openai/test_basic.py
0 → 100644
View file @
500b93c8
from
http
import
HTTPStatus
import
openai
import
pytest
import
requests
from
vllm.version
import
__version__
as
VLLM_VERSION
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a running ``RemoteOpenAIServer``.

    The server is started for ``MODEL_NAME`` with the flags below and is
    torn down when the context manager exits after the module's tests.
    """
    cli_flags = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server
@pytest.fixture(scope="module")
def client(server):
    """Module-scoped fixture returning ``server.get_async_client()``."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_show_version(client: openai.AsyncOpenAI):
    """Check that ``GET /version`` reports the installed vLLM version.

    Args:
        client: Async OpenAI client; only its ``base_url`` is used here.
    """
    # Drop the last three characters of the client's base URL plus any
    # surrounding slashes to get the server root (presumably this removes a
    # trailing "v1/" API prefix -- confirm against RemoteOpenAIServer).
    base_url = str(client.base_url)[:-3].strip("/")

    # requests has no default timeout; bound the call so an unresponsive
    # server fails the test instead of hanging the whole run.
    response = requests.get(base_url + "/version", timeout=30)
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.asyncio
async def test_check_health(client: openai.AsyncOpenAI):
    """Check that ``GET /health`` answers with HTTP 200.

    Args:
        client: Async OpenAI client; only its ``base_url`` is used here.
    """
    # Drop the last three characters of the client's base URL plus any
    # surrounding slashes to get the server root (presumably this removes a
    # trailing "v1/" API prefix -- confirm against RemoteOpenAIServer).
    base_url = str(client.base_url)[:-3].strip("/")

    # requests has no default timeout; bound the call so an unresponsive
    # server fails the test instead of hanging the whole run.
    response = requests.get(base_url + "/health", timeout=30)

    assert response.status_code == HTTPStatus.OK
@pytest.mark.asyncio
async def test_log_metrics(client: openai.AsyncOpenAI):
    """Check that ``GET /metrics`` answers with HTTP 200.

    Args:
        client: Async OpenAI client; only its ``base_url`` is used here.
    """
    # Drop the last three characters of the client's base URL plus any
    # surrounding slashes to get the server root (presumably this removes a
    # trailing "v1/" API prefix -- confirm against RemoteOpenAIServer).
    base_url = str(client.base_url)[:-3].strip("/")

    # requests has no default timeout; bound the call so an unresponsive
    # server fails the test instead of hanging the whole run.
    response = requests.get(base_url + "/metrics", timeout=30)

    assert response.status_code == HTTPStatus.OK
tests/entrypoints/openai/test_chat.py
View file @
500b93c8
...
@@ -7,11 +7,11 @@ import jsonschema
...
@@ -7,11 +7,11 @@ import jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
torch
import
torch
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
...
@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
def
server
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
# noqa: F811
return
snapshot_download
(
repo_id
=
LORA_NAME
)
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
@
pytest
.
fixture
(
scope
=
"module"
)
"bfloat16"
,
def
server
(
zephyr_lora_files
):
"--max-model-len"
,
with
RemoteOpenAIServer
([
"8192"
,
"--model"
,
"--enforce-eager"
,
MODEL_NAME
,
# lora config below
# use half precision for speed and memory savings in CI environment
"--enable-lora"
,
"--dtype"
,
"--lora-modules"
,
"bfloat16"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
"--max-model-len"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"8192"
,
"--max-lora-rank"
,
"--enforce-eager"
,
"64"
,
# lora config below
"--max-cpu-loras"
,
"--enable-lora"
,
"2"
,
"--lora-modules"
,
"--max-num-seqs"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
"128"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
]
"--max-lora-rank"
,
"64"
,
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
])
as
remote_server
:
yield
remote_server
yield
remote_server
...
...
tests/entrypoints/openai/test_completion.py
View file @
500b93c8
# imports for guided decoding tests
# imports for guided decoding tests
import
json
import
json
import
re
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
from
typing
import
List
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
requests
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer
...
@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically th
is needs Mistral-7B-v0.1 as base, but we're not testing
# technically th
ese adapters use a different base model,
# generation quality here
#
but we're not testing
generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
PA_NAME
=
"swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS
=
8
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -28,28 +34,58 @@ def zephyr_lora_files():
...
@@ -28,28 +34,58 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
with
RemoteOpenAIServer
([
tmp_dir
=
TemporaryDirectory
()
"--model"
,
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
MODEL_NAME
,
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
# use half precision for speed and memory savings in CI environment
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
"--dtype"
,
# Copy tokenizer to adapter and add some unique tokens
"bfloat16"
,
# 32000, 32001, 32002
"--max-model-len"
,
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
"8192"
,
special_tokens
=
True
)
"--enforce-eager"
,
assert
added
==
3
# lora config below
tokenizer
.
save_pretrained
(
tmp_model_dir
)
"--enable-lora"
,
yield
tmp_model_dir
"--lora-modules"
,
tmp_dir
.
cleanup
()
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-lora-rank"
,
@
pytest
.
fixture
(
scope
=
"module"
)
"64"
,
def
zephyr_pa_files
():
"--max-cpu-loras"
,
return
snapshot_download
(
repo_id
=
PA_NAME
)
"2"
,
"--max-num-seqs"
,
"128"
,
@
pytest
.
fixture
(
scope
=
"module"
)
])
as
remote_server
:
def
server
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
,
zephyr_pa_files
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--max-num-seqs"
,
"128"
,
"--enforce-eager"
,
# lora config
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
# pa config
"--enable-prompt-adapter"
,
"--prompt-adapters"
,
f
"zephyr-pa=
{
zephyr_pa_files
}
"
,
f
"zephyr-pa2=
{
zephyr_pa_files
}
"
,
"--max-prompt-adapters"
,
"2"
,
"--max-prompt-adapter-token"
,
"128"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -60,11 +96,14 @@ def client(server):
...
@@ -60,11 +96,14 @@ def client(server):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
# first test base model, then test loras, then test prompt adapters
"model_name"
,
"model_name,num_virtual_tokens"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[(
MODEL_NAME
,
0
),
(
"zephyr-lora"
,
0
),
(
"zephyr-lora2"
,
0
),
(
"zephyr-pa"
,
PA_NUM_VIRTUAL_TOKENS
),
(
"zephyr-pa2"
,
PA_NUM_VIRTUAL_TOKENS
)],
)
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
num_virtual_tokens
:
int
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
max_tokens
=
5
,
...
@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
...
@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
assert
len
(
choice
.
text
)
>=
5
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
completion_tokens
=
5
,
prompt_tokens
=
6
+
num_virtual_tokens
,
total_tokens
=
11
+
num_virtual_tokens
)
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
len
(
completion
.
choices
[
0
].
text
)
>=
1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    """Echoed prompt for ``zephyr-lora2`` must contain the added tokens.

    Sends a token-ID prompt with ``echo=True`` and checks the returned text
    starts with ``<unk><unk>vllm1vllm2vllm3``.
    """
    # test using token IDs
    token_id_prompt = [0, 0, 32000, 32001, 32002]
    response = await client.completions.create(
        model="zephyr-lora2",
        prompt=token_id_prompt,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )

    echoed_text = response.choices[0].text
    # Added tokens should appear in tokenized prompt
    assert echoed_text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    """Echoed prompt for the base model must not contain the added tokens.

    Sends the same token-ID prompt as the LoRA variant of this test, but to
    ``MODEL_NAME``, and checks that ``"vllm"`` is absent from the text.
    """
    # test using token IDs
    token_id_prompt = [0, 0, 32000, 32001, 32002]
    response = await client.completions.create(
        model=MODEL_NAME,
        prompt=token_id_prompt,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )

    echoed_text = response.choices[0].text
    # Added tokens should not appear in tokenized prompt
    assert "vllm" not in echoed_text
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
# first test base model, then test loras
, then test prompt adapters
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
,
"zephyr-pa"
,
"zephyr-pa2"
],
)
)
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
...
@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
...
@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
# just test 1 lora
and 1 pa
hereafter
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
)
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
...
@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
...
@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
)
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
...
@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
...
@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
)
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
...
@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
...
@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
await
client
.
completions
.
create
(
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
...
@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
...
@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
stream
=
await
client
.
completions
.
create
(
stream
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
...
@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
...
@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
)
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
...
@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
[
"HuggingFaceH4/
zephyr-
7b-bet
a"
,
"zephyr-
lor
a"
],
[
MODEL_NAME
,
"
zephyr-
lor
a"
,
"zephyr-
p
a"
],
)
)
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
...
@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
)
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test both text and token IDs
# test both text and token IDs
...
@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
...
@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
prompt
=
"Give an example string that fits this regex"
,
prompt
=
"Give an example string that fits this regex"
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_json
=
sample_json_schema
))
guided_json
=
sample_json_schema
))
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
    """Check ``POST /tokenize`` against a locally loaded tokenizer.

    The endpoint is called with and without special tokens; each response
    must equal the local tokenizer's IDs, their count, and a
    ``max_model_len`` of 8192.

    Args:
        client: Async OpenAI client; only its ``base_url`` is used here.
        model_name: Model name sent in the request body.
    """
    # Drop the last three characters of the client's base URL plus any
    # surrounding slashes to get the server root (presumably this removes a
    # trailing "v1/" API prefix -- confirm against RemoteOpenAIServer).
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")

    # The prompt does not change between iterations, so define it once.
    prompt = "This is a test prompt."

    for add_special in [False, True]:
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        # requests has no default timeout; bound the call so an unresponsive
        # server fails the test instead of hanging the whole run.
        response = requests.post(base_url + "/tokenize",
                                 json={
                                     "add_special_tokens": add_special,
                                     "model": model_name,
                                     "prompt": prompt
                                 },
                                 timeout=30)
        response.raise_for_status()

        assert response.json() == {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``POST /detokenize`` reverses local tokenization.

    A prompt is encoded locally without special tokens; the endpoint must
    return the original prompt text for those token IDs.

    Args:
        client: Async OpenAI client; only its ``base_url`` is used here.
        model_name: Model name sent in the request body.
    """
    # Normalize the server root the same way test_tokenize does: strip
    # surrounding slashes and add the separator explicitly. Concatenating
    # "detokenize" onto the sliced base URL only yields a valid URL when the
    # slice happens to end in "/".
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")

    prompt = "This is a test prompt."
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    # requests has no default timeout; bound the call so an unresponsive
    # server fails the test instead of hanging the whole run.
    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             },
                             timeout=30)
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}
tests/entrypoints/openai/test_embedding.py
View file @
500b93c8
...
@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
...
@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
():
def
embedding_server
():
with
RemoteOpenAIServer
(
[
args
=
[
"--model"
,
# use half precision for speed and memory savings in CI environment
EMBEDDING_MODEL_NAME
,
"--dtype"
,
# use half precision for speed and memory savings in CI environment
"bfloat16"
,
"--dtype
"
,
"--enforce-eager
"
,
"bfloat16
"
,
"--max-model-len
"
,
"--enforce-eager
"
,
"8192
"
,
"--max-model-len
"
,
"--enforce-eager
"
,
"8192"
,
]
"--enforce-eager"
,
]
)
as
remote_server
:
with
RemoteOpenAIServer
(
EMBEDDING_MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
...
tests/entrypoints/openai/test_models.py
View file @
500b93c8
...
@@ -19,27 +19,27 @@ def zephyr_lora_files():
...
@@ -19,27 +19,27 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
def
server
(
zephyr_lora_files
):
with
RemoteOpenAIServer
(
[
args
=
[
"--model"
,
# use half precision for speed and memory savings in CI environment
MODEL_NAME
,
"--dtype"
,
# use half precision for speed and memory savings in CI environment
"bfloat16"
,
"--dtype
"
,
"--max-model-len
"
,
"bfloat16
"
,
"8192
"
,
"--max-model-len
"
,
"--enforce-eager
"
,
"8192"
,
# lora config below
"--en
force-eager
"
,
"--en
able-lora
"
,
# lora config below
"--lora-modules"
,
"--enable-lora
"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
"--lora-modu
les"
,
f
"zephyr-lora2=
{
zephyr_lora_fi
les
}
"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
"--max-lora-rank
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"64
"
,
"--max-
lora-rank
"
,
"--max-
cpu-loras
"
,
"64
"
,
"2
"
,
"--max-
cpu-lora
s"
,
"--max-
num-seq
s"
,
"2
"
,
"128
"
,
"--max-num-seqs"
,
]
"128"
,
]
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
500b93c8
...
@@ -32,11 +32,13 @@ async def _async_serving_chat_init():
...
@@ -32,11 +32,13 @@ async def _async_serving_chat_init():
model_config
,
model_config
,
served_model_names
=
[
MODEL_NAME
],
served_model_names
=
[
MODEL_NAME
],
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
)
chat_template
=
CHAT_TEMPLATE
,
lora_modules
=
None
,
prompt_adapters
=
None
,
request_logger
=
None
)
return
serving_completion
return
serving_completion
def
test_async_serving_chat_init
():
def
test_async_serving_chat_init
():
serving_completion
=
asyncio
.
run
(
_async_serving_chat_init
())
serving_completion
=
asyncio
.
run
(
_async_serving_chat_init
())
assert
serving_completion
.
tokenizer
is
not
None
assert
serving_completion
.
chat_template
==
CHAT_TEMPLATE
assert
serving_completion
.
tokenizer
.
chat_template
==
CHAT_TEMPLATE
tests/entrypoints/openai/test_tokenization.py
0 → 100644
View file @
500b93c8
import
openai
# use the official client for correctness check
import
pytest
import
requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared per module.

    The server is launched with the engine flags below plus one LoRA module
    registered as ``zephyr-lora2`` that points at
    ``zephyr_lora_added_tokens_files``.
    """
    engine_flags = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
    # lora config
    lora_flags = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME,
                            engine_flags + lora_flags) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Return the tokenizer identifier to load for ``model_name``.

    For the ``"zephyr-lora2"`` model the value of
    ``zephyr_lora_added_tokens_files`` is returned; any other model name is
    returned unchanged.
    """
    if model_name == "zephyr-lora2":
        return zephyr_lora_added_tokens_files
    return model_name
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str, tokenizer_name: str):
    """Compare ``/tokenize`` output for a plain prompt with a local tokenizer.

    For ``add_special_tokens`` both off and on, the server response must
    contain the locally computed token ids, their count, and a
    ``max_model_len`` of 8192.
    """
    # Drop the last three characters of the base URL, then any "/" at the
    # ends. NOTE(review): presumably this removes a trailing "v1/" — confirm.
    server_root = str(client.base_url)[:-3].strip("/")
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")

    prompt = "vllm1 This is a test prompt."
    endpoint = server_root + "/tokenize"

    for with_special in (False, True):
        expected_ids = reference_tokenizer.encode(
            prompt, add_special_tokens=with_special)

        payload = {
            "add_special_tokens": with_special,
            "model": model_name,
            "prompt": prompt
        }
        response = requests.post(endpoint, json=payload)
        # Fail on any non-2xx status before inspecting the body.
        response.raise_for_status()

        expected_body = {
            "tokens": expected_ids,
            "count": len(expected_ids),
            "max_model_len": 8192
        }
        assert response.json() == expected_body
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                             tokenizer_name: str):
    """Compare ``/tokenize`` output for chat messages with a local tokenizer.

    Every combination of ``add_generation_prompt`` and
    ``add_special_tokens`` is exercised. The expected ids come from
    rendering the conversation with the local tokenizer's chat template
    (as text) and then encoding that text.
    """
    # Drop the last three characters of the base URL, then any "/" at the
    # ends. NOTE(review): presumably this removes a trailing "v1/" — confirm.
    server_root = str(client.base_url)[:-3].strip("/")
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")
    endpoint = server_root + "/tokenize"

    # Same visiting order as two nested loops: generation flag outermost.
    flag_pairs = [(with_generation, with_special)
                  for with_generation in (False, True)
                  for with_special in (False, True)]

    for with_generation, with_special in flag_pairs:
        conversation = [{
            "role": "user",
            "content": "Hi there!"
        }, {
            "role": "assistant",
            "content": "Nice to meet you!"
        }, {
            "role": "user",
            "content": "Can I ask a question? vllm1"
        }]

        rendered = reference_tokenizer.apply_chat_template(
            add_generation_prompt=with_generation,
            conversation=conversation,
            tokenize=False)
        expected_ids = reference_tokenizer.encode(
            rendered, add_special_tokens=with_special)

        payload = {
            "add_generation_prompt": with_generation,
            "add_special_tokens": with_special,
            "messages": conversation,
            "model": model_name
        }
        response = requests.post(endpoint, json=payload)
        # Fail on any non-2xx status before inspecting the body.
        response.raise_for_status()

        expected_body = {
            "tokens": expected_ids,
            "count": len(expected_ids),
            "max_model_len": 8192
        }
        assert response.json() == expected_body
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    """Check that ``/detokenize`` turns token ids back into the prompt.

    The prompt is encoded locally without special tokens; the ids are
    posted to the server and the returned JSON must equal
    ``{"prompt": prompt}``.
    """
    # Drop the last three characters of the base URL, then any "/" at the
    # ends. NOTE(review): presumably this removes a trailing "v1/" — confirm.
    server_root = str(client.base_url)[:-3].strip("/")
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    token_ids = reference_tokenizer.encode(prompt, add_special_tokens=False)

    # Trace which server and model this case is hitting.
    print(f"CALLING {server_root} FOR {model_name}")
    payload = {"model": model_name, "tokens": token_ids}
    response = requests.post(server_root + "/detokenize", json=payload)
    # Fail on any non-2xx status before inspecting the body.
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}
tests/entrypoints/openai/test_vision.py
View file @
500b93c8
...
@@ -2,9 +2,8 @@ from typing import Dict, List
...
@@ -2,9 +2,8 @@ from typing import Dict, List
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
...utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
...utils
import
VLLM_PATH
,
RemoteOpenAIServer
...
@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [
...
@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
with
RemoteOpenAIServer
(
[
args
=
[
"--model
"
,
"--dtype
"
,
MODEL_NAME
,
"bfloat16"
,
"--dtype
"
,
"--max-model-len
"
,
"bfloat1
6"
,
"409
6"
,
"--max-model-len
"
,
"--enforce-eager
"
,
"4096
"
,
"--chat-template
"
,
"--enforce-eager"
,
str
(
LLAVA_CHAT_TEMPLATE
)
,
"--chat-template"
,
]
str
(
LLAVA_CHAT_TEMPLATE
),
]
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -42,11 +41,10 @@ def client(server):
...
@@ -42,11 +41,10 @@ def client(server):
return
server
.
get_async_client
()
return
server
.
get_async_client
()
@
pytest
_asyncio
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
async
def
base64_encoded_image
()
->
Dict
[
str
,
str
]:
def
base64_encoded_image
()
->
Dict
[
str
,
str
]:
return
{
return
{
image_url
:
image_url
:
encode_image_base64
(
fetch_image
(
image_url
))
encode_image_base64
(
await
ImageFetchAiohttp
.
fetch_image
(
image_url
))
for
image_url
in
TEST_IMAGE_URLS
for
image_url
in
TEST_IMAGE_URLS
}
}
...
...
tests/kernels/quant_utils.py
0 → 100644
View file @
500b93c8
from
typing
import
Optional
,
Tuple
,
Union
import
torch
def as_float32_tensor(
        x: Union[float, torch.Tensor],
        device: Union[str, torch.device] = 'cuda') -> torch.Tensor:
    """Return ``x`` as a float32 tensor on ``device``.

    Args:
        x: A Python scalar or an existing tensor.
        device: Target device. Defaults to ``'cuda'``, the device this
            helper has always used, so existing one-argument calls are
            unaffected.

    Returns:
        The result of ``torch.as_tensor(x, dtype=torch.float32,
        device=device)``.
    """
    # Annotations use torch.Tensor (the type); torch.tensor is the factory
    # function and is not a valid type annotation.
    return torch.as_tensor(x, dtype=torch.float32, device=device)
def ref_dynamic_per_token_quant(
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    scale_ub: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Reference dynamic per-token quantization to int8 or fp8 (e4m3fn).

    One scale is derived per slice along the last dimension of ``x`` as
    ``max(|x|) / quant_dtype.max``; ``x`` is then divided by its scale
    (int8: multiplied by the reciprocal and rounded), clamped to the
    representable range of ``quant_dtype`` and cast.

    Args:
        x: Input activations. The ``[:, None]`` indexing of the per-token
            maxima yields one scale per row when ``x`` is 2-D.
            NOTE(review): other ranks look unsupported — confirm.
        quant_dtype: ``torch.int8`` or ``torch.float8_e4m3fn``.
        scale_ub: Optional upper bound applied to the per-token absolute
            maxima before the scales are computed. Only accepted for fp8.

    Returns:
        ``(quantized, scales)``: the quantized tensor in ``quant_dtype`` and
        the float32 scales with a trailing axis of size 1.

    Raises:
        AssertionError: If ``quant_dtype`` is unsupported, or ``scale_ub``
            is given together with int8.
    """
    assert quant_dtype in [torch.int8, torch.float8_e4m3fn]
    if scale_ub is not None:
        assert quant_dtype == torch.float8_e4m3fn

    # Integer and floating dtypes expose their limits through different
    # info objects.
    qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
        else torch.finfo(quant_dtype)
    qtype_max = as_float32_tensor(qtype_traits.max)
    s_1 = as_float32_tensor(1.0)
    s_512 = as_float32_tensor(512.0)

    # For fp8, in order to match the cuda kernel output, we have to do exactly
    # the same operations as in the corresponding fp8 kernel to prevent
    # rounding errors. Do not reorder or algebraically simplify the
    # arithmetic below.

    # Compute scales
    x_token_max, _ = x.abs().max(dim=-1)
    x_token_max = as_float32_tensor(x_token_max)
    if scale_ub is not None:
        x_token_max = x_token_max.clamp(max=scale_ub)
    scales = (x_token_max / qtype_max)[:, None]

    # Quant
    if quant_dtype == torch.int8:
        # int8 path: multiply by the reciprocal scale, then round.
        iscales = as_float32_tensor(s_1 / scales)
        torch_out = as_float32_tensor(x) * iscales
        torch_out = torch_out.round()
        torch_out = torch_out.clamp(qtype_traits.min,
                                    qtype_traits.max).to(quant_dtype)
    else:
        assert quant_dtype == torch.float8_e4m3fn
        # fp8 path: floor the scales at 1 / (fp8_max * 512), then divide.
        min_scaling_factor = s_1 / (qtype_max * s_512)
        scales = scales.clamp(min=min_scaling_factor)
        torch_out = as_float32_tensor(x) / scales
        torch_out = torch_out.clamp(qtype_traits.min,
                                    qtype_traits.max).to(quant_dtype)

    return torch_out, scales
# The int8 version is very similar. Incorporate the int8 version, like in
# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
# kernel
def ref_dynamic_per_tensor_fp8_quant(
        x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Reference dynamic per-tensor quantization to fp8 (e4m3fn).

    A single scale ``max(|x|) / fp8_max`` is computed for the whole tensor;
    ``x`` is multiplied by the reciprocal of that scale, clamped to the fp8
    range and cast to ``torch.float8_e4m3fn``.

    Args:
        x: Input activations.

    Returns:
        ``(quantized, scale)``: the fp8 tensor and the float32 scale.
    """
    fp8_traits = torch.finfo(torch.float8_e4m3fn)
    fp8_max = as_float32_tensor(fp8_traits.max)
    one = as_float32_tensor(1.0)

    # For fp8, in order to match the cuda kernel output, we have to do exactly
    # the same operations as in the corresponding fp8 kernel to prevent
    # rounding errors. Do not reorder or algebraically simplify the
    # arithmetic below.

    x_max = as_float32_tensor(x.abs().max())
    ref_scale = x_max / fp8_max
    ref_iscale = one / ref_scale
    ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
        fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn)
    return ref_out, ref_scale
tests/kernels/test_attention.py
View file @
500b93c8
...
@@ -176,7 +176,7 @@ def test_paged_attention(
...
@@ -176,7 +176,7 @@ def test_paged_attention(
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Using default kv_scale
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
# Call the paged attention kernel.
# Call the paged attention kernel.
output
=
torch
.
empty_like
(
query
)
output
=
torch
.
empty_like
(
query
)
...
@@ -194,7 +194,8 @@ def test_paged_attention(
...
@@ -194,7 +194,8 @@ def test_paged_attention(
max_seq_len
,
max_seq_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
)
)
elif
version
==
"v2"
:
elif
version
==
"v2"
:
num_partitions
=
((
max_seq_len
+
PARTITION_SIZE
-
1
)
//
PARTITION_SIZE
)
num_partitions
=
((
max_seq_len
+
PARTITION_SIZE
-
1
)
//
PARTITION_SIZE
)
...
@@ -225,7 +226,8 @@ def test_paged_attention(
...
@@ -225,7 +226,8 @@ def test_paged_attention(
max_seq_len
,
max_seq_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
)
)
else
:
else
:
raise
AssertionError
(
f
"Unknown version:
{
version
}
"
)
raise
AssertionError
(
f
"Unknown version:
{
version
}
"
)
...
...
tests/kernels/test_blocksparse_attention.py
View file @
500b93c8
...
@@ -212,7 +212,7 @@ def test_paged_attention(
...
@@ -212,7 +212,7 @@ def test_paged_attention(
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Using default kv_scale
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
tp_rank
=
0
tp_rank
=
0
# Call the paged attention kernel.
# Call the paged attention kernel.
...
@@ -231,7 +231,8 @@ def test_paged_attention(
...
@@ -231,7 +231,8 @@ def test_paged_attention(
max_seq_len
,
max_seq_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
tp_rank
=
tp_rank
,
tp_rank
=
tp_rank
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
...
@@ -267,7 +268,8 @@ def test_paged_attention(
...
@@ -267,7 +268,8 @@ def test_paged_attention(
max_seq_len
,
max_seq_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
tp_rank
=
tp_rank
,
tp_rank
=
tp_rank
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
...
...
tests/kernels/test_cache.py
View file @
500b93c8
...
@@ -156,11 +156,11 @@ def test_reshape_and_cache(
...
@@ -156,11 +156,11 @@ def test_reshape_and_cache(
cloned_value_cache
=
value_cache
.
clone
()
cloned_value_cache
=
value_cache
.
clone
()
# Using default kv_scale
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
# Call the reshape_and_cache kernel.
# Call the reshape_and_cache kernel.
ops
.
reshape_and_cache
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
ops
.
reshape_and_cache
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
kv_cache_dtype
,
kv_scale
)
kv_cache_dtype
,
k
_scale
,
v_scale
)
if
kv_cache_dtype
==
"fp8"
:
if
kv_cache_dtype
==
"fp8"
:
result_key_cache
=
torch
.
empty_like
(
key_cache
,
dtype
=
torch
.
float16
)
result_key_cache
=
torch
.
empty_like
(
key_cache
,
dtype
=
torch
.
float16
)
...
...
tests/kernels/test_fp8_quant.py
0 → 100644
View file @
500b93c8
import
pytest
import
torch
import
vllm._custom_ops
as
ops
from
tests.kernels.quant_utils
import
(
ref_dynamic_per_tensor_fp8_quant
,
ref_dynamic_per_token_quant
)
# Parameter grids for the fp8 quantization tests.
DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [
    # Arbitrary values for testing
    1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, 8193,
    # vectorized conversion edge cases
    *range(1024, 1033),
]
NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
SCALE_UBS = [True, False]
SEEDS = [0]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("scale_ub", SCALE_UBS)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                                     dtype: torch.dtype, scale_ub: bool,
                                     seed: int) -> None:
    """Compare ``ops.scaled_fp8_quant`` (per-token) with the reference.

    When ``scale_ub`` is true, the mean of the random input is used as the
    upper bound passed to both implementations; otherwise no bound is used.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
                   device="cuda") + 1e-6  # avoid nans

    # Turn the boolean switch into the actual bound tensor (or None).
    if scale_ub:
        upper_bound = torch.mean(x).to(dtype=torch.float32, device='cuda')
    else:
        upper_bound = None

    ref_out, ref_scales = ref_dynamic_per_token_quant(
        x, torch.float8_e4m3fn, upper_bound)
    ops_out, ops_scales = ops.scaled_fp8_quant(
        x, scale_ub=upper_bound, use_per_token_if_dynamic=True)

    assert torch.allclose(ref_scales, ops_scales)
    # Both outputs are converted to float32 before being compared.
    ref_as_f32 = ref_out.to(dtype=torch.float32)
    ops_as_f32 = ops_out.to(dtype=torch.float32)
    assert torch.allclose(ref_as_f32, ops_as_f32)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
                                      dtype: torch.dtype, seed: int) -> None:
    """Compare ``ops.scaled_fp8_quant`` (default call) with the reference.

    Both the scale and the quantized output (converted to float32) must
    match ``ref_dynamic_per_tensor_fp8_quant``.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")

    expected_out, expected_scale = ref_dynamic_per_tensor_fp8_quant(x)
    actual_out, actual_scale = ops.scaled_fp8_quant(x)

    assert torch.allclose(expected_scale, actual_scale)
    # Both outputs are converted to float32 before being compared.
    expected_f32 = expected_out.to(dtype=torch.float32)
    actual_f32 = actual_out.to(dtype=torch.float32)
    assert torch.allclose(expected_f32, actual_f32)
# Regression test for a case with large activations where an int32 index cannot
# represent the number of elements.
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None:
    """Quantize a very large bfloat16 activation and compare with reference.

    The scale produced by the reference is passed to
    ``ops.scaled_fp8_quant`` and only the quantized outputs are compared.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
    hidden_size = 1152  # Smallest hidden_size to reproduce the error
    dtype = torch.bfloat16

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    expected_out, scale = ref_dynamic_per_tensor_fp8_quant(x)
    actual_out, _ = ops.scaled_fp8_quant(x, scale)

    # Minimize memory footprint in this test by freeing x and upconverting
    # the outputs in place. (torch.allclose does not support fp8)
    del x
    expected_out = expected_out.to(dtype=dtype)
    actual_out = actual_out.to(dtype=dtype)

    assert torch.allclose(expected_out, actual_out)
tests/kernels/test_int8_quant.py
View file @
500b93c8
...
@@ -3,6 +3,8 @@ import torch
...
@@ -3,6 +3,8 @@ import torch
# ruff: noqa: F401
# ruff: noqa: F401
import
vllm._C
import
vllm._C
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
vllm._custom_ops
import
scaled_int8_quant
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
@@ -21,23 +23,16 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -21,23 +23,16 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x_token_max
,
_
=
x
.
max
(
dim
=
1
)
# reference
x_token_max
=
x_token_max
.
to
(
dtype
=
torch
.
float32
)
ref_out
,
ref_scales
=
ref_dynamic_per_token_quant
(
x
,
torch
.
int8
)
scales
=
(
x_token_max
/
float
(
127.0
))[:,
None
].
to
(
device
=
"cuda"
,
# kernel
dtype
=
torch
.
float32
)
ops_out
,
ops_scales
=
scaled_int8_quant
(
x
)
torch_out
=
(
x
/
scales
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
ops_out
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
,
device
=
"cuda"
)
scales_out
=
torch
.
empty_like
(
scales
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
torch
.
ops
.
_C
.
dynamic_scaled_int8_quant
(
ops_out
,
x
,
scales_out
)
assert
torch
.
allclose
(
scales
_out
,
scales
)
assert
torch
.
allclose
(
ops_
scales
,
ref_
scales
)
assert
torch
.
allclose
(
torch
_out
,
ops
_out
,
assert
torch
.
allclose
(
ops
_out
,
ref
_out
,
atol
=
1
)
# big atol to account for rounding errors
atol
=
1
)
# big atol to account for rounding errors
...
@@ -55,12 +50,11 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -55,12 +50,11 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
scale
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
out1
=
(
x
/
scale
).
round
().
clamp
(
int8_traits
.
min
,
out1
=
(
x
/
scale
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
int8_traits
.
max
).
to
(
torch
.
int8
)
out2
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
)
out2
,
_
=
scaled_int8_quant
(
x
,
scale
)
scale_argument
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
out2
,
x
,
scale_argument
)
assert
torch
.
allclose
(
out1
,
out2
,
assert
torch
.
allclose
(
out1
,
out2
,
atol
=
1
)
# big atol to account for rounding errors
atol
=
1
)
# big atol to account for rounding errors
tests/kernels/test_marlin_gemm.py
View file @
500b93c8
...
@@ -12,17 +12,18 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
...
@@ -12,17 +12,18 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_
MARLIN_SUPPORTED_GROUP_SIZES
,
GPTQ_
MARLIN_SUPPORTED_NUM_BITS
,
MARLIN_SUPPORTED_GROUP_SIZES
,
MARLIN_SUPPORTED_NUM_BITS
,
marlin_permute_scales
)
marlin_make_empty_g_idx
,
marlin_permute_scales
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8
import
(
pack_fp8_to_int32
)
pack_fp8_to_int32
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test
import
(
MarlinWorkspace
,
get_weight_perm
,
marlin_quantize
,
marlin_weights
)
MarlinWorkspace
,
awq_marlin_quantize
,
get_weight_perm
,
marlin_quantize
,
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
marlin_24_quantize
)
marlin_24_quantize
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
gptq_pack
,
quantize_weights
,
sort_weights
)
awq_pack
,
gptq_pack
,
quantize_weights
,
quantize_weights_with_zp
,
from
vllm.utils
import
is_hip
sort_weights
)
ACT_ORDER_OPTS
=
[
False
,
True
]
ACT_ORDER_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
...
@@ -58,12 +59,12 @@ def rand_data(shape, dtype=torch.float16):
...
@@ -58,12 +59,12 @@ def rand_data(shape, dtype=torch.float16):
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
act_order
,
def
test_
gptq_
marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
act_order
,
mnk_factors
):
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_m
=
m_factor
...
@@ -121,12 +122,60 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
...
@@ -121,12 +122,60 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size,
                           mnk_factors):
    """Check the AWQ-to-Marlin repack kernel against the reference packing.

    Random weights are quantized with zero points, packed once to AWQ
    format and once directly to Marlin format; repacking the AWQ tensor
    with ``ops.awq_marlin_repack`` must give the Marlin-format tensor.
    """
    m_factor, n_factor, k_factor = mnk_factors

    size_m = m_factor
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor

    print(f"MNK = {size_m} {size_n} {size_k}")

    # Normalize group_size: -1 means a single group spanning all of K.
    group_size = size_k if group_size == -1 else group_size
    assert group_size <= size_k

    # Create input
    b_weight = rand_data((size_k, size_n))

    # Quantize; only the quantized weights are needed here.
    _w_ref, q_w, _scales, _zero_points = quantize_weights_with_zp(
        b_weight, num_bits, group_size)

    # Pack to AWQ format
    q_w_awq = awq_pack(q_w, num_bits, size_k, size_n)

    # Pack to Marlin format (reference result)
    weight_perm = get_weight_perm(num_bits)
    expected_packed = marlin_weights(q_w, size_k, size_n, num_bits,
                                     weight_perm)

    # Run Marlin repack GPU kernel
    repacked = ops.awq_marlin_repack(q_w_awq, size_k, size_n, num_bits)
    torch.cuda.synchronize()

    assert torch.allclose(expected_packed, repacked)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
K_FULL_OPTS
)
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
K_FULL_OPTS
)
def
test_marlin_gemm
(
def
test_
gptq_
marlin_gemm
(
k_chunk
,
k_chunk
,
n_chunk
,
n_chunk
,
num_bits
,
num_bits
,
...
@@ -156,6 +205,8 @@ def test_marlin_gemm(
...
@@ -156,6 +205,8 @@ def test_marlin_gemm(
w_ref
,
marlin_q_w
,
marlin_s
,
g_idx
,
sort_indices
,
_
=
marlin_quantize
(
w_ref
,
marlin_q_w
,
marlin_s
,
g_idx
,
sort_indices
,
_
=
marlin_quantize
(
b_weight
,
num_bits
,
group_size
,
act_order
)
b_weight
,
num_bits
,
group_size
,
act_order
)
marlin_zp
=
marlin_make_empty_g_idx
(
marlin_s
.
device
)
workspace
=
MarlinWorkspace
(
size_n
,
GPTQ_MARLIN_MIN_THREAD_N
,
workspace
=
MarlinWorkspace
(
size_n
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_MARLIN_MAX_PARALLEL
)
GPTQ_MARLIN_MAX_PARALLEL
)
...
@@ -163,6 +214,7 @@ def test_marlin_gemm(
...
@@ -163,6 +214,7 @@ def test_marlin_gemm(
a_input
,
a_input
,
marlin_q_w
,
marlin_q_w
,
marlin_s
,
marlin_s
,
marlin_zp
,
g_idx
,
g_idx
,
sort_indices
,
sort_indices
,
workspace
.
scratch
,
workspace
.
scratch
,
...
@@ -171,6 +223,7 @@ def test_marlin_gemm(
...
@@ -171,6 +223,7 @@ def test_marlin_gemm(
b_weight
.
shape
[
1
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
a_input
.
shape
[
1
],
is_k_full
,
is_k_full
,
has_zp
=
False
,
)
)
output_ref
=
torch
.
matmul
(
a_input
,
w_ref
)
output_ref
=
torch
.
matmul
(
a_input
,
w_ref
)
...
@@ -189,7 +242,8 @@ def test_marlin_gemm(
...
@@ -189,7 +242,8 @@ def test_marlin_gemm(
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_24_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
):
def
test_gptq_marlin_24_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_m
=
m_factor
...
@@ -302,3 +356,65 @@ def test_fp8_marlin_gemm(
...
@@ -302,3 +356,65 @@ def test_fp8_marlin_gemm(
print
(
"max_diff = {}"
.
format
(
max_diff
))
print
(
"max_diff = {}"
.
format
(
max_diff
))
assert
max_diff
<
0.04
assert
max_diff
<
0.04
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
def test_awq_marlin_gemm(
    k_chunk,
    n_chunk,
    num_bits,
    group_size,
    mnk_factors,
):
    """Check the Marlin GEMM kernel on AWQ-quantized weights with zero points.

    The kernel output is compared with ``torch.matmul`` on the reference
    weights returned by ``awq_marlin_quantize``; the difference reported by
    ``compute_max_diff`` must be below 0.04.
    """
    m_factor, n_factor, k_factor = mnk_factors

    size_m = m_factor
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor

    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")

    a_input = rand_data((size_m, size_k))
    b_weight = rand_data((size_k, size_n))

    w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
        b_weight, num_bits, group_size)

    # This path passes empty tensors for the activation-order arguments.
    weight_device = marlin_q_w.device
    g_idx = torch.empty(0, dtype=torch.int, device=weight_device)
    sort_indices = torch.empty(0, dtype=torch.int, device=weight_device)

    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                GPTQ_MARLIN_MAX_PARALLEL)

    # Problem sizes as handed to the kernel, taken from the tensor shapes.
    prob_m = a_input.shape[0]
    prob_n = b_weight.shape[1]
    prob_k = a_input.shape[1]

    output = ops.gptq_marlin_gemm(
        a_input,
        marlin_q_w,
        marlin_s,
        marlin_zp,
        g_idx,
        sort_indices,
        workspace.scratch,
        num_bits,
        prob_m,
        prob_n,
        prob_k,
        True,  # is_k_full
        True,  # has_zp
    )
    output_ref = torch.matmul(a_input, w_ref)

    torch.cuda.synchronize()

    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))

    assert max_diff < 0.04
tests/lora/conftest.py
View file @
500b93c8
...
@@ -159,8 +159,14 @@ def dummy_model_gate_up() -> nn.Module:
...
@@ -159,8 +159,14 @@ def dummy_model_gate_up() -> nn.Module:
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
sql_lora_files
():
def
sql_lora_huggingface_id
():
return
snapshot_download
(
repo_id
=
"yard1/llama-2-7b-sql-lora-test"
)
# huggingface repo id is used to test lora runtime downloading.
return
"yard1/llama-2-7b-sql-lora-test"
@
pytest
.
fixture
(
scope
=
"session"
)
def
sql_lora_files
(
sql_lora_huggingface_id
):
return
snapshot_download
(
repo_id
=
sql_lora_huggingface_id
)
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
...
...
tests/lora/test_long_context.py
View file @
500b93c8
...
@@ -29,7 +29,7 @@ def _create_lora_request(lora_id, long_context_infos):
...
@@ -29,7 +29,7 @@ def _create_lora_request(lora_id, long_context_infos):
context_len
=
long_context_infos
[
lora_id
][
"context_length"
]
context_len
=
long_context_infos
[
lora_id
][
"context_length"
]
scaling_factor
=
context_len_to_scaling_factor
[
context_len
]
scaling_factor
=
context_len_to_scaling_factor
[
context_len
]
return
LoRARequest
(
context_len
,
lora_id
,
return
LoRARequest
(
context_len
,
lora_id
,
long_context_infos
[
lora_id
][
"lora"
],
long_context_infos
[
lora_id
][
"lora"
],
None
,
4096
*
scaling_factor
)
4096
*
scaling_factor
)
...
...
Prev
1
2
3
4
5
6
7
8
9
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment