Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a5753ff5
Commit
a5753ff5
authored
Jun 19, 2024
by
zhuwenwen
Browse files
v0.5.0.post1
parents
21c06ecb
0f0d8bc0
Changes
108
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
528 additions
and
323 deletions
+528
-323
tests/distributed/test_utils.py
tests/distributed/test_utils.py
+31
-0
tests/entrypoints/test_openai_embedding.py
tests/entrypoints/test_openai_embedding.py
+113
-0
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+47
-159
tests/entrypoints/test_openai_vision.py
tests/entrypoints/test_openai_vision.py
+17
-18
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+9
-9
tests/lora/conftest.py
tests/lora/conftest.py
+13
-10
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+2
-11
tests/models/test_fp8.py
tests/models/test_fp8.py
+10
-10
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+2
-11
tests/models/test_gptq_marlin_24.py
tests/models/test_gptq_marlin_24.py
+2
-11
tests/models/test_marlin.py
tests/models/test_marlin.py
+2
-11
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+3
-7
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+25
-2
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+49
-7
tests/quantization/utils.py
tests/quantization/utils.py
+14
-0
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+95
-18
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+1
-1
tests/utils.py
tests/utils.py
+88
-36
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+3
-1
vllm/__init__.py
vllm/__init__.py
+2
-1
No files found.
tests/distributed/test_utils.py
0 → 100644
View file @
a5753ff5
import
os
import
ray
from
vllm.utils
import
cuda_device_count_stateless
@ray.remote
class _CUDADeviceCountStatelessTestActor:
    """Ray remote class used to inspect CUDA device visibility remotely.

    Each method either reads or rewrites ``CUDA_VISIBLE_DEVICES`` in the
    process that executes it, or reports the device count computed by
    ``cuda_device_count_stateless``.
    """

    def get_count(self):
        """Return the value reported by ``cuda_device_count_stateless()``."""
        return cuda_device_count_stateless()

    def set_cuda_visible_devices(self, cuda_visible_devices: str):
        """Overwrite ``CUDA_VISIBLE_DEVICES`` with ``cuda_visible_devices``."""
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices

    def get_cuda_visible_devices(self):
        """Return the current ``CUDA_VISIBLE_DEVICES`` value.

        Raises:
            KeyError: if the variable is not set in the environment.
        """
        return os.environ["CUDA_VISIBLE_DEVICES"]
def test_cuda_device_count_stateless():
    """Check that ``cuda_device_count_stateless`` follows
    ``CUDA_VISIBLE_DEVICES`` when the variable is changed at runtime."""
    handle = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()

    # The actor is created with two GPUs; the test expects them to be
    # exposed as devices "0,1" and counted as 2.
    visible = ray.get(handle.get_cuda_visible_devices.remote())
    assert visible == "0,1"
    initial_count = ray.get(handle.get_count.remote())
    assert initial_count == 2

    # Shrink the visible set step by step; the reported count must follow.
    for devices, expected_count in (("0", 1), ("", 0)):
        ray.get(handle.set_cuda_visible_devices.remote(devices))
        assert ray.get(handle.get_count.remote()) == expected_count
tests/entrypoints/test_openai_embedding.py
0 → 100644
View file @
a5753ff5
import
openai
import
pytest
import
ray
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
# Model identifier passed to the server via "--model" and used as the
# "model_name" parameter of the embedding tests in this module.
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
# Module-level pytest mark: tags every test in this module with "openai".
pytestmark
=
pytest
.
mark
.
openai
@pytest.fixture(scope="module")
def ray_ctx():
    """Module-scoped fixture that brackets the tests with a Ray session.

    Calls ``ray.init`` with ``VLLM_PATH`` as the runtime working directory
    before the tests run and ``ray.shutdown`` once the module is done.
    """
    runtime_env = {"working_dir": VLLM_PATH}
    ray.init(runtime_env=runtime_env)
    yield
    ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(ray_ctx):
    """Module-scoped fixture returning a ``RemoteOpenAIServer``.

    The server is constructed with CLI-style arguments selecting
    ``EMBEDDING_MODEL_NAME``. Requesting ``ray_ctx`` makes pytest run
    ``ray.init`` before the server object is created.
    """
    return RemoteOpenAIServer([
        "--model",
        EMBEDDING_MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # NOTE: this flag was previously listed twice; once is enough.
        "--enforce-eager",
        "--max-model-len",
        "8192",
    ])
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
    """Module-scoped fixture returning the server's async client.

    The value is whatever ``embedding_server.get_async_client()`` returns;
    the tests in this module annotate it as ``openai.AsyncOpenAI``.

    No ``pytest.mark.asyncio`` is applied here: marks on fixture functions
    have no effect and are deprecated by pytest, and this fixture is a
    plain synchronous function.
    """
    return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding from text, then one from raw token IDs.

    Each response must carry an id, exactly one 4096-dimensional vector,
    zero completion tokens, and prompt/total token counts equal to the
    expected prompt length.
    """
    # (request input, expected prompt token count)
    cases = (
        # test single embedding
        (["The chef prepared a delicious meal."], 9),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    )

    for request_input, expected_tokens in cases:
        response = await embedding_client.embeddings.create(
            model=model_name,
            input=request_input,
            encoding_format="float",
        )
        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        usage = response.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_tokens
        assert usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    The string batch is checked for result count and vector size; the
    token-ID batch is additionally checked for its usage accounting.
    """
    # test List[str]
    text_batch = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_response = await embedding_client.embeddings.create(
        model=model_name,
        input=text_batch,
        encoding_format="float",
    )
    assert text_response.id is not None
    assert len(text_response.data) == 3
    assert len(text_response.data[0].embedding) == 4096

    # test List[List[int]]
    token_batch = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_response = await embedding_client.embeddings.create(
        model=model_name,
        input=token_batch,
        encoding_format="float",
    )
    assert token_response.id is not None
    assert len(token_response.data) == 4
    assert len(token_response.data[0].embedding) == 4096
    usage = token_response.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
tests/entrypoints/test_openai_server.py
View file @
a5753ff5
...
...
@@ -15,11 +15,10 @@ from openai import BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
..utils
import
ServerRunn
er
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServ
er
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
...
...
@@ -80,9 +79,15 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
def
ray_ctx
():
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
})
yield
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
,
ray_ctx
):
return
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
...
...
@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
...
...
@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
"--max-num-seqs"
,
"128"
,
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
(
zephyr_lora_files
):
ray
.
shutdown
()
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
EMBEDDING_MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
"--max-model-len"
,
"8192"
,
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_check_models
(
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
...
...
@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
...
...
@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_no_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_some_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_completion_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
with
pytest
.
raises
(
...
...
@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_no_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_no_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_zero_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_some_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_some_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_completion_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is an LLM?"
...
...
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_chat_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_chat_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_chat_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
...
...
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
...
...
@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
async
def
test_logits_bias
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_logits_bias
(
client
:
openai
.
AsyncOpenAI
):
prompt
=
"Hello, my name is"
max_tokens
=
5
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
...
...
@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_json_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_regex_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_decoding_type_error
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_decoding_type_error
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
...
...
@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_named_tool_use
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_required_tool_use_not_yet_supported
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_inconsistent_tool_choice_and_tools
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_object
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_response_format_json_object
(
client
:
openai
.
AsyncOpenAI
):
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_complex_message_content
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_complex_message_content
(
client
:
openai
.
AsyncOpenAI
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
...
...
@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_custom_role
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_custom_role
(
client
:
openai
.
AsyncOpenAI
):
# Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way
...
...
@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_guided_grammar
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_guided_grammar
(
client
:
openai
.
AsyncOpenAI
):
simple_sql_grammar
=
"""
start: select_statement
...
...
@@ -1351,7 +1315,7 @@ number: "1" | "2"
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_echo_logprob_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
logprobs_arg
:
int
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# test using text and token IDs
...
...
@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
async
def
test_long_seed
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_long_seed
(
client
:
openai
.
AsyncOpenAI
):
for
seed
in
[
torch
.
iinfo
(
torch
.
long
).
min
-
1
,
torch
.
iinfo
(
torch
.
long
).
max
+
1
...
...
@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_single_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"The chef prepared a delicious meal."
,
]
# test single embedding
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
9
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
5
assert
embeddings
.
usage
.
total_tokens
==
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_batch_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test List[str]
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
# test List[List[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
tests/entrypoints/test_openai_vision.py
View file @
a5753ff5
...
...
@@ -8,7 +8,7 @@ import ray
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
..utils
import
ServerRunn
er
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServ
er
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE
=
(
Path
(
__file__
).
parent
.
parent
.
parent
/
...
...
@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
pytestmark
=
pytest
.
mark
.
openai
@
pytest
.
fixture
(
scope
=
"module"
)
def
ray_ctx
():
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
})
yield
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
return
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
"--dtype"
,
...
...
@@ -47,18 +53,11 @@ def server():
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest_asyncio
.
fixture
(
scope
=
"session"
)
...
...
@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image_base64encoded
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
base64_encoded_image
:
Dict
[
str
,
str
]):
messages
=
[{
...
...
@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_chat_streaming_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_streaming_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_multi_image_input
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
async
def
test_multi_image_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
tests/kernels/test_cutlass.py
View file @
a5753ff5
...
...
@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
out_dtype
)
...
...
@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
out_dtype
)
...
...
@@ -180,11 +180,11 @@ def test_cutlass_subset():
scale_a
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
scale_b
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
torch
.
bfloat16
)
...
...
@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
self
.
out_dtype
=
out_dtype
def
forward
(
self
,
a
):
return
ops
.
cutlass_scaled_mm
_dq
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
self
.
out_dtype
)
return
ops
.
cutlass_scaled_mm
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
self
.
out_dtype
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
...
...
tests/lora/conftest.py
View file @
a5753ff5
...
...
@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
import
vllm
from
vllm.config
import
LoRAConfig
from
vllm.distributed
import
destroy_model_parallel
,
initialize_model_parallel
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
RowParallelLinear
)
...
...
@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
def
cleanup
():
destroy_model_parallel
()
destroy_distributed_environment
()
with
contextlib
.
suppress
(
AssertionError
):
torch
.
distributed
.
destroy_process_group
()
gc
.
collect
()
...
...
@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@
pytest
.
fixture
def
dist_init
():
if
not
torch
.
distributed
.
is_initialized
():
temp_file
=
tempfile
.
mkstemp
()[
1
]
torch
.
distributed
.
init_process_group
(
backend
=
"nccl"
,
world_size
=
1
,
rank
=
0
,
init_method
=
f
"file://
{
temp_file
}
"
,
)
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
cuda
())
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
1
,
1
)
yield
cleanup
()
...
...
tests/models/test_aqlm.py
View file @
a5753ff5
...
...
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
aqlm_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
from
tests.quantization.utils
import
is_quant_method_supported
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
...
...
@@ -67,7 +58,7 @@ ground_truth_generations = [
]
@
pytest
.
mark
.
skipif
(
aqlm_not
_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method
_supported
(
"aqlm"
)
,
reason
=
"AQLM is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_fp8.py
View file @
a5753ff5
...
...
@@ -8,8 +8,8 @@ import pytest
import
torch
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
...
@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
},
}
fp8_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@
pytest
.
mark
.
skip
(
reason
=
"Prevent unstable test based on golden strings from breaking the build."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
...
...
tests/models/test_gptq_marlin.py
View file @
a5753ff5
...
...
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
import
os
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.rotary_embedding
import
_ROPE_DICT
from
.utils
import
check_logprobs_close
...
...
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
gptq_marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
# act_order==False, group_size=channelwise
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
...
...
@@ -53,7 +44,7 @@ MODELS = [
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
pytest
.
mark
.
skipif
(
gptq_marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
,
reason
=
"gptq_marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
...
...
tests/models/test_gptq_marlin_24.py
View file @
a5753ff5
...
...
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
from
dataclasses
import
dataclass
import
pytest
import
torch
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
from
tests.quantization.utils
import
is_quant_method_supported
@
dataclass
...
...
@@ -47,7 +38,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin_24"
)
,
reason
=
"Marlin24 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_marlin.py
View file @
a5753ff5
...
...
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
from
dataclasses
import
dataclass
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
.utils
import
check_logprobs_close
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
class
ModelPair
:
...
...
@@ -45,7 +36,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"marlin"
)
,
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
a5753ff5
...
...
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
capability
<
QUANTIZATION_METHODS
[
'bitsandbytes'
].
get_min_capability
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
def
test_load_bnb_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
'huggyllama/llama-7b'
,
quantization
=
'bitsandbytes'
,
...
...
tests/quantization/test_compressed_tensors.py
View file @
a5753ff5
...
...
@@ -3,12 +3,13 @@
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
import
pytest
import
torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW
8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsLinearMethod
,
CompressedTensorsW
4A16
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
...
...
@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8DynamicToken
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
@pytest.mark.parametrize(
    "w4a16_args",
    [
        ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
        ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
    ],
)
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
    """Load a W4A16 compressed-tensors checkpoint and inspect its first layer.

    Each parameter tuple is ``(checkpoint, expected strategy, expected group
    size)``. The test checks the quantization method/scheme classes attached
    to the first decoder layer's ``qkv_proj`` and the dtypes of its packed
    weight and scale tensors.
    """
    checkpoint, expected_strategy, expected_group_size = w4a16_args

    with vllm_runner(checkpoint) as llm:
        # Drill down from the runner to the model held by the driver worker.
        driver_worker = llm.model.llm_engine.model_executor.driver_worker
        loaded_model = driver_worker.model_runner.model
        qkv_proj = loaded_model.model.layers[0].self_attn.qkv_proj

        # The layer must be wired to the compressed-tensors W4A16 scheme.
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)

        # The scheme must reflect the checkpoint's quantization config.
        assert qkv_proj.scheme.strategy == expected_strategy
        assert qkv_proj.scheme.group_size == expected_group_size

        # Storage layout of the quantized parameters.
        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
        assert qkv_proj.weight_packed.pack_factor == 8
tests/quantization/test_fp8.py
View file @
a5753ff5
...
...
@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm._custom_ops
import
scaled_fp8_quant
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
...
...
@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Compare ``scaled_fp8_quant`` with a pure-PyTorch reference.

    Three paths are exercised: dynamic quantization (the op returns the
    scale), static quantization (the scale is passed in), and padding of the
    output's first dimension via ``batch_dim_padding``.
    """
    fp8 = torch.float8_e4m3fn

    def quantize_ref(tensor, inv_scale):
        # The reference implementation that fully aligns to
        # the kernel being tested.
        limits = torch.finfo(fp8)
        scale = inv_scale.reciprocal()
        scaled = tensor.to(torch.float32) * scale
        return scaled.clamp(min=limits.min, max=limits.max).to(fp8)

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        # Cast back to the working dtype, then undo the scaling.
        return tensor.to(dtype) * inv_scale

    # Note that we use a shape % 4 != 0 to cover edge cases,
    # because scaled_fp8_quant is vectorized by 4.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)

    # Dynamic quantization: its dequantized output is the baseline that
    # every other path is compared against.
    dynamic_q, inv_scale = scaled_fp8_quant(x, None)
    ref_y = per_tensor_dequantize(dynamic_q, inv_scale, dtype)

    # Reference dynamic quantization
    reference_q = quantize_ref(x, inv_scale)
    assert torch.allclose(
        ref_y, per_tensor_dequantize(reference_q, inv_scale, dtype))

    # Static quantization
    static_q, _ = scaled_fp8_quant(x, inv_scale)
    assert torch.allclose(
        ref_y, per_tensor_dequantize(static_q, inv_scale, dtype))

    # Padding: the output grows to 17 rows; only the first x.shape[0] rows
    # are compared with the baseline.
    padded_q, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
    assert padded_q.shape[0] == 17
    unpadded_q = torch.narrow(padded_q, 0, 0, x.shape[0])
    assert torch.allclose(
        ref_y, per_tensor_dequantize(unpadded_q, inv_scale, dtype))
tests/quantization/utils.py
0 → 100644
View file @
a5753ff5
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
def is_quant_method_supported(quant_method: str) -> bool:
    """Return whether the current GPU can run the given quantization method.

    Args:
        quant_method: Key used to index ``QUANTIZATION_METHODS``.

    Returns:
        ``False`` when CUDA is not available. Otherwise ``True`` only if the
        device's compute capability, encoded as ``major * 10 + minor``, is at
        least the value returned by the method's ``get_min_capability()``.
    """
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
    min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability()

    # Callers use `skipif(not is_quant_method_supported(...))`, so this must
    # be True when the device meets or exceeds the minimum capability.
    return capability >= min_capability
tests/tensorizer_loader/test_tensorizer.py
View file @
a5753ff5
import
json
import
os
import
pathlib
import
subprocess
from
unittest.mock
import
MagicMock
,
patch
import
openai
import
pytest
import
ray
import
torch
from
tensorizer
import
EncryptionParams
from
vllm
import
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
TensorSerializer
,
is_vllm_tensorized
,
load_with_tensorizer
,
open_stream
,
serialize_vllm_model
)
serialize_vllm_model
,
tensorize_vllm_model
)
from
..utils
import
ServerRunner
from
..conftest
import
VllmRunner
,
cleanup
from
..utils
import
RemoteOpenAIServer
# yapf conflicts with isort for this docstring
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -42,6 +49,20 @@ def is_curl_installed():
except
(
subprocess
.
CalledProcessError
,
FileNotFoundError
):
return
False
def get_torch_model(vllm_runner: VllmRunner):
    """Return the model object held by the runner's driver-worker model runner.

    Follows ``model -> llm_engine -> model_executor -> driver_worker ->
    model_runner -> model`` on the given runner (presumably yielding the
    underlying torch module; confirm against ``VllmRunner``).
    """
    engine = vllm_runner.model.llm_engine
    driver_worker = engine.model_executor.driver_worker
    return driver_worker.model_runner.model
def write_keyfile(keyfile_path: str):
    """Write a freshly generated encryption key to ``keyfile_path``.

    Random ``EncryptionParams`` are created first. Any missing parent
    directories of the target are then created, and the key is written in
    binary mode.
    """
    params = EncryptionParams.random()

    destination = pathlib.Path(keyfile_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open('wb') as key_file:
        key_file.write(params.key)
@
pytest
.
fixture
(
autouse
=
True
)
def
tensorizer_config
():
...
...
@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
key_path
=
tmp_path
/
(
model_ref
+
".key"
)
write_keyfile
(
key_path
)
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
config_for_serializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_engine
,
config_for_serializing
,
encryption_key_path
=
key_path
)
config_for_serializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
,
encryption_keyfile
=
key_path
)
serialize_vllm_model
(
get_torch_model
(
vllm_model
),
config_for_serializing
)
config_for_deserializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
,
encryption_keyfile
=
key_path
)
...
...
@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
TensorizerConfig
(
tensorizer_uri
=
model_path
))
with
vllm_runner
(
...
...
@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
TensorizerConfig
(
tensorizer_uri
=
model_path
))
model_loader_extra_config
=
{
...
...
@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
openai_args
=
[
"--model"
,
model_ref
,
"--dtype"
,
"float16"
,
"--load-format"
,
"tensorizer"
,
"--model-loader-extra-config"
,
json
.
dumps
(
model_loader_extra_config
),
"--port"
,
"8000"
json
.
dumps
(
model_loader_extra_config
),
]
server
=
ServerRunner
.
remote
(
openai_args
)
assert
ray
.
get
(
server
.
ready
.
remote
())
server
=
RemoteOpenAIServer
(
openai_args
)
print
(
"Server ready."
)
client
=
openai
.
OpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
client
=
server
.
get_client
()
completion
=
client
.
completions
.
create
(
model
=
model_ref
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
...
...
@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
model_loader_extra_config
=
TensorizerConfig
(
tensorizer_uri
=
"test"
))
def
test_tensorizer_with_tp
(
vllm_runner
):
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Requires 2 GPUs"
)
def
test_tensorizer_with_tp_path_without_template
(
vllm_runner
):
with
pytest
.
raises
(
ValueError
):
model_ref
=
"EleutherAI/pythia-1.4b"
tensorized_path
=
f
"s3://tensorized/
{
model_ref
}
/fp16/model.tensors"
...
...
@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
s3_endpoint
=
"object.ord1.coreweave.com"
,
),
tensor_parallel_size
=
2
,
disable_custom_all_reduce
=
True
,
)
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
        vllm_runner, tmp_path):
    """Round-trip an encrypted, two-shard tensorized model and compare outputs.

    Generations from the plain model are recorded first. The model is then
    serialized with ``tensor_parallel_size=2`` and an encryption keyfile,
    reloaded through the ``tensorizer`` load format, and its generations must
    equal the recorded ones.
    """
    model_ref = "EleutherAI/pythia-1.4b"
    # Engine options shared by every model instance in this test.
    shared_options = dict(disable_custom_all_reduce=True, enforce_eager=True)

    # record outputs from un-sharded un-tensorized model
    reference_model = vllm_runner(model_ref, **shared_options)
    outputs = reference_model.generate(prompts, sampling_params)

    # Tear the reference model down completely before starting the next one.
    reference_model.model.llm_engine.model_executor.shutdown()
    del reference_model
    cleanup()
    ray.shutdown()

    # load model with two shards and serialize with encryption
    # (`%02d` in the path is filled with the shard index below).
    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
    key_path = tmp_path / (model_ref + ".key")
    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    sharded_engine_args = EngineArgs(
        model=model_ref,
        tensor_parallel_size=2,
        **shared_options,
    )
    tensorize_vllm_model(
        engine_args=sharded_engine_args,
        tensorizer_config=tensorizer_config,
    )

    # One serialized file is expected per shard.
    for shard_index in (0, 1):
        assert os.path.isfile(
            model_path % shard_index), "Serialization subprocess failed"
    cleanup()
    ray.shutdown()

    loaded_vllm_model = vllm_runner(
        model_ref,
        tensor_parallel_size=2,
        load_format="tensorizer",
        model_loader_extra_config=tensorizer_config,
        **shared_options,
    )
    deserialized_outputs = loaded_vllm_model.generate(prompts,
                                                      sampling_params)

    assert outputs == deserialized_outputs
def
test_vllm_tensorized_model_has_same_outputs
(
vllm_runner
,
tmp_path
):
model_ref
=
"facebook/opt-125m"
...
...
@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
)
as
vllm_model
:
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
config
)
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
config
)
assert
is_vllm_tensorized
(
config
)
...
...
tests/test_sharded_state_loader.py
View file @
a5753ff5
...
...
@@ -39,7 +39,7 @@ def test_filter_subtensors():
filtered_state_dict
=
ShardedStateLoader
.
_filter_subtensors
(
state_dict
)
assert
tuple
(
filtered_state_dict
.
keys
())
==
(
"a"
,
"b"
,
"c"
)
for
key
,
tensor
in
filtered_state_dict
.
items
():
# NOTE: don't use `e
u
qal` here, as the tensor might contain NaNs
# NOTE: don't use `eq
u
al` here, as the tensor might contain NaNs
assert
tensor
is
state_dict
[
key
]
...
...
tests/utils.py
View file @
a5753ff5
...
...
@@ -4,57 +4,109 @@ import sys
import
time
import
warnings
from
contextlib
import
contextmanager
from
typing
import
List
import
openai
import
ray
import
requests
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.utils
import
get_open_port
# Path to root of repository so that utilities can be imported by ray workers
VLLM_PATH
=
os
.
path
.
abspath
(
os
.
path
.
join
(
__file__
,
os
.
pardir
,
os
.
pardir
))
@
ray
.
remote
(
num_gpus
=
1
)
class
ServerRunner
:
class
RemoteOpenAIServer
:
DUMMY_API_KEY
=
"token-abc123"
# vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S
=
600
# wait up to 600 seconds for the server to start
def
__init__
(
self
,
args
):
env
=
os
.
environ
.
copy
()
env
[
"PYTHONUNBUFFERED"
]
=
"1"
self
.
proc
=
subprocess
.
Popen
(
[
sys
.
executable
,
"-m"
,
"vllm.entrypoints.openai.api_server"
]
+
args
,
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
,
@
ray
.
remote
(
num_gpus
=
1
)
class
_RemoteRunner
:
def
__init__
(
self
,
cli_args
:
List
[
str
],
*
,
wait_url
:
str
,
wait_timeout
:
float
)
->
None
:
env
=
os
.
environ
.
copy
()
env
[
"PYTHONUNBUFFERED"
]
=
"1"
self
.
proc
=
subprocess
.
Popen
(
[
sys
.
executable
,
"-m"
,
"vllm.entrypoints.openai.api_server"
,
*
cli_args
],
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
,
)
self
.
_wait_for_server
(
url
=
wait_url
,
timeout
=
wait_timeout
)
def
ready
(
self
):
return
True
def
_wait_for_server
(
self
,
*
,
url
:
str
,
timeout
:
float
):
# run health check
start
=
time
.
time
()
while
True
:
try
:
if
requests
.
get
(
url
).
status_code
==
200
:
break
except
Exception
as
err
:
if
self
.
proc
.
poll
()
is
not
None
:
raise
RuntimeError
(
"Server exited unexpectedly."
)
from
err
time
.
sleep
(
0.5
)
if
time
.
time
()
-
start
>
timeout
:
raise
RuntimeError
(
"Server failed to start in time."
)
from
err
def
__del__
(
self
):
if
hasattr
(
self
,
"proc"
):
self
.
proc
.
terminate
()
def
__init__
(
self
,
cli_args
:
List
[
str
],
*
,
auto_port
:
bool
=
True
)
->
None
:
if
auto_port
:
if
"-p"
in
cli_args
or
"--port"
in
cli_args
:
raise
ValueError
(
"You have manually specified the port"
"when `auto_port=True`."
)
cli_args
=
cli_args
+
[
"--port"
,
str
(
get_open_port
())]
parser
=
make_arg_parser
()
args
=
parser
.
parse_args
(
cli_args
)
self
.
host
=
str
(
args
.
host
or
'localhost'
)
self
.
port
=
int
(
args
.
port
)
self
.
_runner
=
self
.
_RemoteRunner
.
remote
(
cli_args
,
wait_url
=
self
.
url_for
(
"health"
),
wait_timeout
=
self
.
MAX_SERVER_START_WAIT_S
)
self
.
_wait_until_ready
()
@
property
def
url_root
(
self
)
->
str
:
return
f
"http://
{
self
.
host
}
:
{
self
.
port
}
"
def
url_for
(
self
,
*
parts
:
str
)
->
str
:
return
self
.
url_root
+
"/"
+
"/"
.
join
(
parts
)
def
_wait_until_ready
(
self
)
->
None
:
ray
.
get
(
self
.
_runner
.
ready
.
remote
())
def
get_client
(
self
):
return
openai
.
OpenAI
(
base_url
=
self
.
url_for
(
"v1"
),
api_key
=
self
.
DUMMY_API_KEY
,
)
def
get_async_client
(
self
):
return
openai
.
AsyncOpenAI
(
base_url
=
self
.
url_for
(
"v1"
),
api_key
=
self
.
DUMMY_API_KEY
,
)
self
.
_wait_for_server
()
def
ready
(
self
):
return
True
def
_wait_for_server
(
self
):
# run health check
start
=
time
.
time
()
while
True
:
try
:
if
requests
.
get
(
"http://localhost:8000/health"
).
status_code
==
200
:
break
except
Exception
as
err
:
if
self
.
proc
.
poll
()
is
not
None
:
raise
RuntimeError
(
"Server exited unexpectedly."
)
from
err
time
.
sleep
(
0.5
)
if
time
.
time
()
-
start
>
self
.
MAX_SERVER_START_WAIT_S
:
raise
RuntimeError
(
"Server failed to start in time."
)
from
err
def
__del__
(
self
):
if
hasattr
(
self
,
"proc"
):
self
.
proc
.
terminate
()
def
init_test_distributed_environment
(
...
...
tests/worker/test_model_runner.py
View file @
a5753ff5
import
pytest
import
torch
from
vllm.distributed.parallel_state
import
init_distributed_environment
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
...
...
@@ -292,6 +293,7 @@ def distributed_init():
rank
=
0
,
distributed_init_method
=
f
"tcp://127.0.0.1:
{
get_open_port
()
}
"
,
local_rank
=
0
)
ensure_model_parallel_initialized
(
1
,
1
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
2
,
128
)))
...
...
vllm/__init__.py
View file @
a5753ff5
...
...
@@ -13,9 +13,10 @@ from vllm.pooling_params import PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.version
import
__dcu_version__
__version__
=
"0.5.0"
from
.version
import
__version__
__all__
=
[
"__version__"
,
"LLM"
,
"ModelRegistry"
,
"PromptStrictInputs"
,
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment