Commit a5753ff5 authored by zhuwenwen's avatar zhuwenwen
Browse files

v0.5.0.post1

parents 21c06ecb 0f0d8bc0
import os
import ray
from vllm.utils import cuda_device_count_stateless
@ray.remote
class _CUDADeviceCountStatelessTestActor:
    """Ray actor used to probe the CUDA device count from inside its process.

    The actor lets a test read and overwrite ``CUDA_VISIBLE_DEVICES`` in the
    actor's own environment and then ask ``cuda_device_count_stateless`` what
    it reports there.
    """

    def get_count(self):
        """Return the value reported by ``cuda_device_count_stateless()``."""
        return cuda_device_count_stateless()

    def set_cuda_visible_devices(self, cuda_visible_devices: str):
        """Overwrite ``CUDA_VISIBLE_DEVICES`` in this process's environment."""
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices

    def get_cuda_visible_devices(self):
        """Return the current ``CUDA_VISIBLE_DEVICES`` value.

        Raises ``KeyError`` if the variable is not set.
        """
        return os.environ["CUDA_VISIBLE_DEVICES"]
def test_cuda_device_count_stateless():
    """Test that cuda_device_count_stateless changes return value if
    CUDA_VISIBLE_DEVICES is changed."""
    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()

    # Baseline: the actor was requested with two GPUs.
    assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
    assert ray.get(actor.get_count.remote()) == 2

    # Narrow the visible devices step by step and re-check the count each
    # time; the set call is awaited before the count is queried.
    for visible_devices, expected_count in (("0", 1), ("", 0)):
        ray.get(actor.set_cuda_visible_devices.remote(visible_devices))
        assert ray.get(actor.get_count.remote()) == expected_count
import openai
import pytest
import ray
from ..utils import VLLM_PATH, RemoteOpenAIServer
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def ray_ctx():
    """Module-scoped fixture that brackets the tests with a Ray session.

    Ray is initialised with ``VLLM_PATH`` as the runtime working directory
    before the tests run and shut down once the fixture is torn down.
    """
    runtime_env = {"working_dir": VLLM_PATH}
    ray.init(runtime_env=runtime_env)
    yield
    ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(ray_ctx):
    """Module-scoped fixture returning a ``RemoteOpenAIServer`` for the
    embedding model.

    Depends on ``ray_ctx`` so that Ray is initialised before the server
    object is created.  The argument list is handed to ``RemoteOpenAIServer``
    unchanged (presumably forwarded as server CLI flags -- confirm against
    ``tests/utils``).
    """
    return RemoteOpenAIServer([
        "--model",
        EMBEDDING_MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # passed exactly once; the flag was previously duplicated
        "--enforce-eager",
        "--max-model-len",
        "8192",
    ])
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
    """Module-scoped fixture returning the async client of the embedding
    server.

    The client comes from ``embedding_server.get_async_client()``; the tests
    annotate it as ``openai.AsyncOpenAI``.

    No ``pytest.mark.asyncio`` here: this is a plain synchronous fixture, and
    marks applied to fixtures have no effect (pytest deprecates them).
    """
    return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding from text and one from token IDs.

    Both responses must contain exactly one 4096-element embedding, report
    zero completion tokens, and report the expected prompt/total token count.
    """
    # (request payload, expected prompt-token count), issued in this order
    cases = (
        # test single embedding
        (["The chef prepared a delicious meal."], 9),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    )

    for payload, expected_tokens in cases:
        response = await embedding_client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )

        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        assert response.usage.completion_tokens == 0
        assert response.usage.prompt_tokens == expected_tokens
        assert response.usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    Each response must hold one 4096-element embedding per input; token usage
    is additionally checked for the token-ID batch.
    """
    # test List[str]
    text_batch = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    text_response = await embedding_client.embeddings.create(
        model=model_name,
        input=text_batch,
        encoding_format="float",
    )
    assert text_response.id is not None
    assert len(text_response.data) == 3
    assert len(text_response.data[0].embedding) == 4096

    # test List[List[int]]
    token_batch = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                   [25, 32, 64, 77]]
    token_response = await embedding_client.embeddings.create(
        model=model_name,
        input=token_batch,
        encoding_format="float",
    )
    assert token_response.id is not None
    assert len(token_response.data) == 4
    assert len(token_response.data[0].embedding) == 4096

    # 5 + 3 + 5 + 4 token IDs were submitted above
    usage = token_response.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
...@@ -15,11 +15,10 @@ from openai import BadRequestError ...@@ -15,11 +15,10 @@ from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import ServerRunner from ..utils import VLLM_PATH, RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here # generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora" LORA_NAME = "typeof/zephyr-7b-beta-lora"
...@@ -80,9 +79,15 @@ def zephyr_lora_files(): ...@@ -80,9 +79,15 @@ def zephyr_lora_files():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): def ray_ctx():
ray.init() ray.init(runtime_env={"working_dir": VLLM_PATH})
server_runner = ServerRunner.remote([ yield
ray.shutdown()
@pytest.fixture(scope="module")
def server(zephyr_lora_files, ray_ctx):
return RemoteOpenAIServer([
"--model", "--model",
MODEL_NAME, MODEL_NAME,
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
...@@ -91,8 +96,6 @@ def server(zephyr_lora_files): ...@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
"--max-model-len", "--max-model-len",
"8192", "8192",
"--enforce-eager", "--enforce-eager",
"--gpu-memory-utilization",
"0.75",
# lora config below # lora config below
"--enable-lora", "--enable-lora",
"--lora-modules", "--lora-modules",
...@@ -105,43 +108,14 @@ def server(zephyr_lora_files): ...@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
"--max-num-seqs", "--max-num-seqs",
"128", "128",
]) ])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def embedding_server(zephyr_lora_files): def client(server):
ray.shutdown() return server.get_async_client()
ray.init()
server_runner = ServerRunner.remote([
"--model",
EMBEDDING_MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--gpu-memory-utilization",
"0.75",
"--max-model-len",
"8192",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module")
def client():
    """Module-scoped fixture yielding an ``openai.AsyncOpenAI`` client bound
    to the fixed local endpoint and API key below."""
    async_client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
    yield async_client
@pytest.mark.asyncio async def test_check_models(client: openai.AsyncOpenAI):
async def test_check_models(server, client: openai.AsyncOpenAI):
models = await client.models.list() models = await client.models.list()
models = models.data models = models.data
served_model = models[0] served_model = models[0]
...@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): ...@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
async def test_single_completion(server, client: openai.AsyncOpenAI, async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
completion = await client.completions.create(model=model_name, completion = await client.completions.create(model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=5, max_tokens=5,
...@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, ...@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
async def test_no_logprobs(server, client: openai.AsyncOpenAI, async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI, ...@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_zero_logprobs(server, client: openai.AsyncOpenAI, async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, ...@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_some_logprobs(server, client: openai.AsyncOpenAI, async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI, ...@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
with pytest.raises( with pytest.raises(
...@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, ...@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_single_chat_session(server, client: openai.AsyncOpenAI, async def test_single_chat_session(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, ...@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_completion_streaming(server, client: openai.AsyncOpenAI, async def test_completion_streaming(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
prompt = "What is an LLM?" prompt = "What is an LLM?"
...@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, ...@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_chat_streaming(server, client: openai.AsyncOpenAI, async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ...@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
) )
async def test_chat_completion_stream_options(server, async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server, ...@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
"model_name", "model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
) )
async def test_completion_stream_options(server, client: openai.AsyncOpenAI, async def test_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
prompt = "What is the capital of France?" prompt = "What is the capital of France?"
...@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI, ...@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_batch_completions(server, client: openai.AsyncOpenAI, async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
# test simple list # test simple list
batch = await client.completions.create( batch = await client.completions.create(
model=model_name, model=model_name,
...@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, ...@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_logits_bias(client: openai.AsyncOpenAI):
prompt = "Hello, my name is" prompt = "Hello, my name is"
max_tokens = 5 max_tokens = 5
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
...@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): ...@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(server, client: openai.AsyncOpenAI, async def test_guided_json_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, ...@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(server, client: openai.AsyncOpenAI, async def test_guided_json_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, ...@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, async def test_guided_regex_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, ...@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, async def test_guided_regex_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, ...@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, async def test_guided_choice_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, ...@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, ...@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
_ = await client.completions.create( _ = await client.completions.create(
...@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, ...@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI, async def test_named_tool_use(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported( async def test_required_tool_use_not_yet_supported(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str): client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools( async def test_inconsistent_tool_choice_and_tools(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str): client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools( ...@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_response_format_json_object(server, client: openai.AsyncOpenAI): async def test_response_format_json_object(client: openai.AsyncOpenAI):
for _ in range(2): for _ in range(2):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): ...@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_extra_fields(server, client: openai.AsyncOpenAI): async def test_extra_fields(client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info: with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create( await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): ...@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_complex_message_content(server, client: openai.AsyncOpenAI): async def test_complex_message_content(client: openai.AsyncOpenAI):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=[{ messages=[{
...@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): ...@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_custom_role(server, client: openai.AsyncOpenAI): async def test_custom_role(client: openai.AsyncOpenAI):
# Not sure how the model handles custom roles so we just check that # Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way # both string and complex message content are handled in the same way
...@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI): ...@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_grammar(server, client: openai.AsyncOpenAI): async def test_guided_grammar(client: openai.AsyncOpenAI):
simple_sql_grammar = """ simple_sql_grammar = """
start: select_statement start: select_statement
...@@ -1351,7 +1315,7 @@ number: "1" | "2" ...@@ -1351,7 +1315,7 @@ number: "1" | "2"
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
@pytest.mark.parametrize("logprobs_arg", [1, 0]) @pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
model_name: str, logprobs_arg: int): model_name: str, logprobs_arg: int):
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# test using text and token IDs # test using text and token IDs
...@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, ...@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_long_seed(server, client: openai.AsyncOpenAI): async def test_long_seed(client: openai.AsyncOpenAI):
for seed in [ for seed in [
torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).min - 1,
torch.iinfo(torch.long).max + 1 torch.iinfo(torch.long).max + 1
...@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): ...@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
or "less_than_equal" in exc_info.value.message) or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding from text and one from token IDs.

    ``embedding_server`` is requested only as a fixture dependency; the test
    body talks to ``client``.  Both responses must contain exactly one
    4096-element embedding with the expected token usage.
    """
    # (request payload, expected prompt-token count), issued in this order
    cases = (
        # test single embedding
        (["The chef prepared a delicious meal."], 9),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    )

    for payload, expected_tokens in cases:
        response = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )

        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        assert response.usage.completion_tokens == 0
        assert response.usage.prompt_tokens == expected_tokens
        assert response.usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    ``embedding_server`` is requested only as a fixture dependency; the test
    body talks to ``client``.  Each response must hold one 4096-element
    embedding per input; token usage is checked for the token-ID batch.
    """
    # test List[str]
    text_batch = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    text_response = await client.embeddings.create(
        model=model_name,
        input=text_batch,
        encoding_format="float",
    )
    assert text_response.id is not None
    assert len(text_response.data) == 3
    assert len(text_response.data[0].embedding) == 4096

    # test List[List[int]]
    token_batch = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                   [25, 32, 64, 77]]
    token_response = await client.embeddings.create(
        model=model_name,
        input=token_batch,
        encoding_format="float",
    )
    assert token_response.id is not None
    assert len(token_response.data) == 4
    assert len(token_response.data[0].embedding) == 4096

    # 5 + 3 + 5 + 4 token IDs were submitted above
    usage = token_response.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__]) pytest.main([__file__])
...@@ -8,7 +8,7 @@ import ray ...@@ -8,7 +8,7 @@ import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
from ..utils import ServerRunner from ..utils import VLLM_PATH, RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
...@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [ ...@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
pytestmark = pytest.mark.openai pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def ray_ctx():
ray.init(runtime_env={"working_dir": VLLM_PATH})
yield
ray.shutdown()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
ray.init() return RemoteOpenAIServer([
server_runner = ServerRunner.remote([
"--model", "--model",
MODEL_NAME, MODEL_NAME,
"--dtype", "--dtype",
...@@ -47,18 +53,11 @@ def server(): ...@@ -47,18 +53,11 @@ def server():
"--chat-template", "--chat-template",
str(LLAVA_CHAT_TEMPLATE), str(LLAVA_CHAT_TEMPLATE),
]) ])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session") @pytest.fixture(scope="module")
def client(): def client(server):
client = openai.AsyncOpenAI( return server.get_async_client()
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest_asyncio.fixture(scope="session") @pytest_asyncio.fixture(scope="session")
...@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]: ...@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str): model_name: str, image_url: str):
messages = [{ messages = [{
"role": "role":
...@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, ...@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded( async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]): base64_encoded_image: Dict[str, str]):
messages = [{ messages = [{
...@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, async def test_chat_streaming_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str): model_name: str, image_url: str):
messages = [{ messages = [{
"role": "role":
...@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, ...@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI, async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
model_name: str, image_url: str): image_url: str):
messages = [{ messages = [{
"role": "role":
......
...@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int, ...@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
scale_b = (torch.randn( scale_b = (torch.randn(
(1, n_b_scales), device=device, dtype=torch.float32) / 10) (1, n_b_scales), device=device, dtype=torch.float32) / 10)
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b * b.to(dtype=torch.float32)).to(out_dtype) scale_b * b.to(dtype=torch.float32)).to(out_dtype)
...@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
scale_b = (torch.randn( scale_b = (torch.randn(
(1, n_b_scales), device=device, dtype=torch.float32) / 10) (1, n_b_scales), device=device, dtype=torch.float32) / 10)
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b * scale_b *
b.to(dtype=torch.float32)).to(dtype=out_dtype) b.to(dtype=torch.float32)).to(dtype=out_dtype)
...@@ -180,11 +180,11 @@ def test_cutlass_subset(): ...@@ -180,11 +180,11 @@ def test_cutlass_subset():
scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
out = ops.cutlass_scaled_mm_dq(a, out = ops.cutlass_scaled_mm(a,
b, b,
scale_a, scale_a,
scale_b, scale_b,
out_dtype=torch.bfloat16) out_dtype=torch.bfloat16)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
scale_b * scale_b *
b.to(dtype=torch.float32)).to(dtype=torch.bfloat16) b.to(dtype=torch.float32)).to(dtype=torch.bfloat16)
...@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module): ...@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
self.out_dtype = out_dtype self.out_dtype = out_dtype
def forward(self, a): def forward(self, a):
return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b, return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
self.out_dtype) self.out_dtype)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
......
...@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download ...@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
import vllm import vllm
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
from vllm.distributed import destroy_model_parallel, initialize_model_parallel from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
init_distributed_environment,
initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear, MergedColumnParallelLinear,
RowParallelLinear) RowParallelLinear)
...@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{ ...@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
def cleanup(): def cleanup():
destroy_model_parallel() destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError): with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group() torch.distributed.destroy_process_group()
gc.collect() gc.collect()
...@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): ...@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@pytest.fixture @pytest.fixture
def dist_init(): def dist_init():
if not torch.distributed.is_initialized(): temp_file = tempfile.mkstemp()[1]
temp_file = tempfile.mkstemp()[1] init_distributed_environment(
torch.distributed.init_process_group( world_size=1,
backend="nccl", rank=0,
world_size=1, distributed_init_method=f"file://{temp_file}",
rank=0, local_rank=0,
init_method=f"file://{temp_file}", backend="nccl",
) )
torch.distributed.all_reduce(torch.zeros(1).cuda())
initialize_model_parallel(1, 1) initialize_model_parallel(1, 1)
yield yield
cleanup() cleanup()
......
...@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`. ...@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
""" """
import pytest import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from tests.quantization.utils import is_quant_method_supported
aqlm_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
aqlm_not_supported = (capability <
QUANTIZATION_METHODS["aqlm"].get_min_capability())
# In this test we hardcode prompts and generations for the model so we don't # In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency # need to require the AQLM package as a dependency
...@@ -67,7 +58,7 @@ ground_truth_generations = [ ...@@ -67,7 +58,7 @@ ground_truth_generations = [
] ]
@pytest.mark.skipif(aqlm_not_supported, @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.") reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
......
...@@ -8,8 +8,8 @@ import pytest ...@@ -8,8 +8,8 @@ import pytest
import torch import torch
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
...@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = { ...@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
}, },
} }
fp8_not_supported = True
if torch.cuda.is_available(): # This test compares against golden strings for exact match since
capability = torch.cuda.get_device_capability() # there is no baseline implementation to compare against
capability = capability[0] * 10 + capability[1] # and is unstable w.r.t specifics of the fp8 implementation or
fp8_not_supported = (capability < # the hardware being run on.
QUANTIZATION_METHODS["fp8"].get_min_capability()) # Disabled to prevent it from breaking the build
@pytest.mark.skip(
reason=
@pytest.mark.skipif(fp8_not_supported, "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
......
...@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`. ...@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
import os import os
import pytest import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from .utils import check_logprobs_close from .utils import check_logprobs_close
...@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" ...@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
gptq_marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
gptq_marlin_not_supported = (
capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
MODELS = [ MODELS = [
# act_order==False, group_size=channelwise # act_order==False, group_size=channelwise
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
...@@ -53,7 +44,7 @@ MODELS = [ ...@@ -53,7 +44,7 @@ MODELS = [
@pytest.mark.flaky(reruns=3) @pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(gptq_marlin_not_supported, @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) @pytest.mark.parametrize("dtype", ["half", "bfloat16"])
......
...@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`. ...@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import torch
from tests.models.utils import check_logprobs_close from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from tests.quantization.utils import is_quant_method_supported
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass @dataclass
...@@ -47,7 +38,7 @@ model_pairs = [ ...@@ -47,7 +38,7 @@ model_pairs = [
@pytest.mark.flaky(reruns=2) @pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(marlin_not_supported, @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
reason="Marlin24 is not supported on this GPU type.") reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
......
...@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`. ...@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from tests.quantization.utils import is_quant_method_supported
from .utils import check_logprobs_close from .utils import check_logprobs_close
marlin_not_supported = True
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass @dataclass
class ModelPair: class ModelPair:
...@@ -45,7 +36,7 @@ model_pairs = [ ...@@ -45,7 +36,7 @@ model_pairs = [
@pytest.mark.flaky(reruns=2) @pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(marlin_not_supported, @pytest.mark.skipif(not is_quant_method_supported("marlin"),
reason="Marlin is not supported on this GPU type.") reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs) @pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
......
...@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. ...@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
import pytest import pytest
import torch import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import SamplingParams from vllm import SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@pytest.mark.skipif( reason='bitsandbytes is not supported on this GPU type.')
capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None: def test_load_bnb_model(vllm_runner) -> None:
with vllm_runner('huggyllama/llama-7b', with vllm_runner('huggyllama/llama-7b',
quantization='bitsandbytes', quantization='bitsandbytes',
......
...@@ -3,12 +3,13 @@ ...@@ -3,12 +3,13 @@
Run `pytest tests/quantization/test_compressed_tensors.py`. Run `pytest tests/quantization/test_compressed_tensors.py`.
""" """
import pytest
import torch import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken, CompressedTensorsLinearMethod, CompressedTensorsW4A16,
CompressedTensorsW8A8StaticTensor) CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
def test_compressed_tensors_w8a8_static_setup(vllm_runner): def test_compressed_tensors_w8a8_static_setup(vllm_runner):
...@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner): ...@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken) assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
assert qkv_proj.weight.dtype is torch.int8 assert qkv_proj.weight.dtype is torch.int8
@pytest.mark.parametrize(
    "w4a16_args",
    [
        ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
        ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
    ],
)
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
    """Check that a W4A16 checkpoint is loaded with the expected scheme.

    Each parameter tuple is ``(model name, expected strategy, expected
    group size)``; the assertions inspect the first decoder layer's
    ``qkv_proj``.
    """
    model_name, expected_strategy, expected_group_size = w4a16_args
    with vllm_runner(model_name) as llm:
        engine = llm.model.llm_engine
        torch_model = engine.model_executor.driver_worker.model_runner.model
        first_layer = torch_model.model.layers[0]
        qkv_proj = first_layer.self_attn.qkv_proj

        # The layer must be wired to the compressed-tensors W4A16 scheme.
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)
        assert qkv_proj.scheme.strategy == expected_strategy
        assert qkv_proj.scheme.group_size == expected_group_size

        # Packed weights are int32 with a pack factor of 8; scales are fp16.
        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
        assert qkv_proj.weight_packed.pack_factor == 8
...@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`. ...@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
import pytest import pytest
import torch import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from tests.quantization.utils import is_quant_method_supported
from vllm._custom_ops import scaled_fp8_quant
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif( reason="FP8 is not supported on this GPU type.")
capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
reason="FP8 is not supported on this GPU type.")
def test_load_fp16_model(vllm_runner) -> None: def test_load_fp16_model(vllm_runner) -> None:
with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
...@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None: ...@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
fc1 = model.model.decoder.layers[0].fc1 fc1 = model.model.decoder.layers[0].fc1
assert isinstance(fc1.quant_method, Fp8LinearMethod) assert isinstance(fc1.quant_method, Fp8LinearMethod)
assert fc1.weight.dtype == torch.float8_e4m3fn assert fc1.weight.dtype == torch.float8_e4m3fn
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Compare ``scaled_fp8_quant`` against a pure-PyTorch reference.

    Covers three call forms: no scale passed in, a scale passed in, and a
    scale plus ``batch_dim_padding``.
    """

    def quantize_ref(tensor, inv_scale):
        # The reference implementation that fully aligns to
        # the kernel being tested.
        limits = torch.finfo(torch.float8_e4m3fn)
        scale = inv_scale.reciprocal()
        scaled = tensor.to(torch.float32) * scale
        clamped = scaled.clamp(min=limits.min, max=limits.max)
        return clamped.to(torch.float8_e4m3fn)

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        # Cast back to the working dtype, then undo the scaling.
        return tensor.to(dtype) * inv_scale

    # Note that we use a shape % 4 != 0 to cover edge cases,
    # because scaled_fp8_quant is vectorized by 4.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)

    # Dynamic quantization: no scale is supplied, the call returns one.
    dynamic_q, inv_scale = scaled_fp8_quant(x, None)
    ref_y = per_tensor_dequantize(dynamic_q, inv_scale, dtype)

    # Reference dynamic quantization
    reference_q = quantize_ref(x, inv_scale)
    assert torch.allclose(
        ref_y, per_tensor_dequantize(reference_q, inv_scale, dtype))

    # Static quantization: reuse the scale obtained above.
    static_q, _ = scaled_fp8_quant(x, inv_scale)
    assert torch.allclose(
        ref_y, per_tensor_dequantize(static_q, inv_scale, dtype))

    # Padding: the first dimension grows to 17; only the leading
    # x.shape[0] rows are compared against the reference.
    padded_q, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
    assert padded_q.shape[0] == 17
    unpadded_q = torch.narrow(padded_q, 0, 0, x.shape[0])
    assert torch.allclose(
        ref_y, per_tensor_dequantize(unpadded_q, inv_scale, dtype))
import torch
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def is_quant_method_supported(quant_method: str) -> bool:
    """Return whether the current GPU can run ``quant_method``.

    Args:
        quant_method: Key into ``QUANTIZATION_METHODS``.

    Returns:
        ``False`` when CUDA is unavailable. Otherwise ``True`` exactly when
        the device capability, encoded as ``major * 10 + minor``, is at
        least the method's ``get_min_capability()``.
    """
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
    min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability()
    # "Supported" means the device meets or exceeds the minimum. Callers
    # use `skipif(not is_quant_method_supported(...))`, so an inverted
    # comparison here would skip tests on capable GPUs and run them on
    # incapable ones.
    return capability >= min_capability
import json import json
import os import os
import pathlib
import subprocess import subprocess
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import openai import openai
import pytest import pytest
import ray import ray
import torch
from tensorizer import EncryptionParams
from vllm import SamplingParams from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
# yapf: disable # yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
TensorSerializer, TensorSerializer,
is_vllm_tensorized, is_vllm_tensorized,
load_with_tensorizer, load_with_tensorizer,
open_stream, open_stream,
serialize_vllm_model) serialize_vllm_model,
tensorize_vllm_model)
from ..utils import ServerRunner from ..conftest import VllmRunner, cleanup
from ..utils import RemoteOpenAIServer
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
...@@ -42,6 +49,20 @@ def is_curl_installed(): ...@@ -42,6 +49,20 @@ def is_curl_installed():
except (subprocess.CalledProcessError, FileNotFoundError): except (subprocess.CalledProcessError, FileNotFoundError):
return False return False
def get_torch_model(vllm_runner: VllmRunner):
    """Return the model held by the runner's driver worker.

    Walks ``model -> llm_engine -> model_executor -> driver_worker ->
    model_runner -> model`` on ``vllm_runner``.
    """
    engine = vllm_runner.model.llm_engine
    driver_worker = engine.model_executor.driver_worker
    return driver_worker.model_runner.model
def write_keyfile(keyfile_path: str):
    """Write a freshly generated random encryption key to ``keyfile_path``.

    Any missing parent directories are created before the key bytes are
    written.
    """
    key = EncryptionParams.random().key
    target = pathlib.Path(keyfile_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(key)
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def tensorizer_config(): def tensorizer_config():
...@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( ...@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
with vllm_runner(model_ref) as vllm_model: with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
key_path = tmp_path / (model_ref + ".key") key_path = tmp_path / (model_ref + ".key")
write_keyfile(key_path)
outputs = vllm_model.generate(prompts, sampling_params) outputs = vllm_model.generate(prompts, sampling_params)
config_for_serializing = TensorizerConfig(tensorizer_uri=model_path) config_for_serializing = TensorizerConfig(
serialize_vllm_model(vllm_model.model.llm_engine, tensorizer_uri=model_path,
config_for_serializing, encryption_keyfile=key_path
encryption_key_path=key_path) )
serialize_vllm_model(get_torch_model(vllm_model),
config_for_serializing)
config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
encryption_keyfile=key_path) encryption_keyfile=key_path)
...@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): ...@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
with vllm_runner(model_ref, ) as vllm_model: with vllm_runner(model_ref, ) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
serialize_vllm_model(vllm_model.model.llm_engine, serialize_vllm_model(get_torch_model(vllm_model),
TensorizerConfig(tensorizer_uri=model_path)) TensorizerConfig(tensorizer_uri=model_path))
with vllm_runner( with vllm_runner(
...@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ...@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
with vllm_runner(model_ref, ) as vllm_model: with vllm_runner(model_ref, ) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
serialize_vllm_model(vllm_model.model.llm_engine, serialize_vllm_model(get_torch_model(vllm_model),
TensorizerConfig(tensorizer_uri=model_path)) TensorizerConfig(tensorizer_uri=model_path))
model_loader_extra_config = { model_loader_extra_config = {
...@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ...@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
openai_args = [ openai_args = [
"--model", model_ref, "--dtype", "float16", "--load-format", "--model", model_ref, "--dtype", "float16", "--load-format",
"tensorizer", "--model-loader-extra-config", "tensorizer", "--model-loader-extra-config",
json.dumps(model_loader_extra_config), "--port", "8000" json.dumps(model_loader_extra_config),
] ]
server = ServerRunner.remote(openai_args) server = RemoteOpenAIServer(openai_args)
assert ray.get(server.ready.remote())
print("Server ready.") print("Server ready.")
client = openai.OpenAI( client = server.get_client()
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
completion = client.completions.create(model=model_ref, completion = client.completions.create(model=model_ref,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=5, max_tokens=5,
...@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): ...@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
def test_tensorizer_with_tp(vllm_runner): @pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
with pytest.raises(ValueError): with pytest.raises(ValueError):
model_ref = "EleutherAI/pythia-1.4b" model_ref = "EleutherAI/pythia-1.4b"
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
...@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner): ...@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
s3_endpoint="object.ord1.coreweave.com", s3_endpoint="object.ord1.coreweave.com",
), ),
tensor_parallel_size=2, tensor_parallel_size=2,
disable_custom_all_reduce=True,
) )
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
                                                                    tmp_path):
    """Check that an encrypted, two-shard tensorized model reproduces the
    outputs of the plain model.

    The test runs in three phases: generate reference outputs with the
    un-sharded model, serialize the model with ``tensor_parallel_size=2``
    and an encryption keyfile, then reload it through the tensorizer load
    format and compare generations.
    """
    model_ref = "EleutherAI/pythia-1.4b"
    # record outputs from un-sharded un-tensorized model
    base_model = vllm_runner(
        model_ref,
        disable_custom_all_reduce=True,
        enforce_eager=True,
    )
    outputs = base_model.generate(prompts, sampling_params)

    # Tear the reference engine down completely (executor, distributed
    # state, Ray) before the tensor-parallel phase starts.
    base_model.model.llm_engine.model_executor.shutdown()
    del base_model
    cleanup()
    ray.shutdown()

    # load model with two shards and serialize with encryption
    # The "%02d" placeholder in the path is filled in per shard; the
    # assertions below expect files for indices 0 and 1.
    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
    # NOTE(review): the keyfile is not written in this test; presumably
    # tensorize_vllm_model creates it -- confirm.
    key_path = tmp_path / (model_ref + ".key")

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    tensorize_vllm_model(
        engine_args=EngineArgs(
            model=model_ref,
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
            enforce_eager=True,
        ),
        tensorizer_config=tensorizer_config,
    )
    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
    # Reset distributed state and Ray again before loading the shards.
    cleanup()
    ray.shutdown()

    # Reload the serialized shards with the same config used to write them.
    loaded_vllm_model = vllm_runner(
        model_ref,
        tensor_parallel_size=2,
        load_format="tensorizer",
        disable_custom_all_reduce=True,
        enforce_eager=True,
        model_loader_extra_config=tensorizer_config)

    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)

    assert outputs == deserialized_outputs
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
model_ref = "facebook/opt-125m" model_ref = "facebook/opt-125m"
...@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): ...@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
with vllm_runner(model_ref) as vllm_model: with vllm_runner(model_ref) as vllm_model:
outputs = vllm_model.generate(prompts, sampling_params) outputs = vllm_model.generate(prompts, sampling_params)
serialize_vllm_model(vllm_model.model.llm_engine, config) serialize_vllm_model(get_torch_model(vllm_model), config)
assert is_vllm_tensorized(config) assert is_vllm_tensorized(config)
......
...@@ -39,7 +39,7 @@ def test_filter_subtensors(): ...@@ -39,7 +39,7 @@ def test_filter_subtensors():
filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict) filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
assert tuple(filtered_state_dict.keys()) == ("a", "b", "c") assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
for key, tensor in filtered_state_dict.items(): for key, tensor in filtered_state_dict.items():
# NOTE: don't use `euqal` here, as the tensor might contain NaNs # NOTE: don't use `equal` here, as the tensor might contain NaNs
assert tensor is state_dict[key] assert tensor is state_dict[key]
......
...@@ -4,57 +4,109 @@ import sys ...@@ -4,57 +4,109 @@ import sys
import time import time
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from typing import List
import openai
import ray import ray
import requests import requests
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment) init_distributed_environment)
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.utils import get_open_port from vllm.utils import get_open_port
# Path to root of repository so that utilities can be imported by ray workers # Path to root of repository so that utilities can be imported by ray workers
VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
@ray.remote(num_gpus=1) class RemoteOpenAIServer:
class ServerRunner: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds
def __init__(self, args): @ray.remote(num_gpus=1)
env = os.environ.copy() class _RemoteRunner:
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen( def __init__(self, cli_args: List[str], *, wait_url: str,
[sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + wait_timeout: float) -> None:
args, env = os.environ.copy()
env=env, env["PYTHONUNBUFFERED"] = "1"
stdout=sys.stdout, self.proc = subprocess.Popen(
stderr=sys.stderr, [
sys.executable, "-m", "vllm.entrypoints.openai.api_server",
*cli_args
],
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
)
self._wait_for_server(url=wait_url, timeout=wait_timeout)
def ready(self):
return True
def _wait_for_server(self, *, url: str, timeout: float):
    """Poll ``url`` until it answers with HTTP 200.

    Args:
        url: Health-check URL to request.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If the server process has exited after a failed
            request, or if no 200 response arrives within ``timeout``
            seconds.
    """
    # run health check
    start = time.time()
    while True:
        last_err = None
        try:
            if requests.get(url).status_code == 200:
                break
        except Exception as err:
            # Failed requests are expected while the server is starting;
            # they are fatal only once the process itself has died.
            if self.proc.poll() is not None:
                raise RuntimeError(
                    "Server exited unexpectedly.") from err
            last_err = err

        # Back off and enforce the deadline after every unsuccessful
        # attempt. Doing this only in the exception path would let a
        # non-200 response spin in a tight loop with no timeout.
        time.sleep(0.5)
        if time.time() - start > timeout:
            raise RuntimeError(
                "Server failed to start in time.") from last_err
def __del__(self):
    """Terminate the child process when this object is finalized.

    ``proc`` may be missing if construction failed before it was assigned,
    in which case there is nothing to clean up.
    """
    if not hasattr(self, "proc"):
        return
    self.proc.terminate()
def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
    """Start the remote runner for the given CLI arguments and wait for it.

    Args:
        cli_args: Command-line arguments; parsed locally with
            ``make_arg_parser()`` to learn host and port, and forwarded
            unchanged to ``_RemoteRunner``.
        auto_port: When true, ``--port`` is appended with the value returned
            by ``get_open_port()``; ``cli_args`` must then not already
            contain ``-p`` or ``--port``.

    Raises:
        ValueError: If ``auto_port`` is true and ``cli_args`` already
            contains ``-p`` or ``--port``.
    """
    if auto_port:
        if "-p" in cli_args or "--port" in cli_args:
            # Adjacent literals are concatenated, so the first one needs a
            # trailing space (the message used to read "...portwhen...").
            raise ValueError("You have manually specified the port "
                             "when `auto_port=True`.")

        # NOTE(review): the ``--port=N`` spelling is not detected by the
        # membership test above; confirm whether it should be rejected too.
        # A new list is built so the caller's list is left untouched.
        cli_args = cli_args + ["--port", str(get_open_port())]

    parser = make_arg_parser()
    args = parser.parse_args(cli_args)
    self.host = str(args.host or 'localhost')
    self.port = int(args.port)

    # host and port must be set first: url_for() builds the URL from them.
    self._runner = self._RemoteRunner.remote(
        cli_args,
        wait_url=self.url_for("health"),
        wait_timeout=self.MAX_SERVER_START_WAIT_S)

    self._wait_until_ready()
@property
def url_root(self) -> str:
    """Base HTTP URL built from ``self.host`` and ``self.port``."""
    host, port = self.host, self.port
    return f"http://{host}:{port}"
def url_for(self, *parts: str) -> str:
    """Join ``parts`` with ``/`` and append them to ``self.url_root``.

    With no parts the result is the root followed by a single ``/``.
    """
    path = "/".join(parts)
    return f"{self.url_root}/{path}"
def _wait_until_ready(self) -> None:
    """Block until the remote runner has answered its ``ready`` call."""
    ready_ref = self._runner.ready.remote()
    ray.get(ready_ref)
def get_client(self):
    """Return an ``openai.OpenAI`` client aimed at this server's ``v1`` URL.

    The client is given the class's ``DUMMY_API_KEY`` as its API key.
    """
    client_kwargs = {
        "base_url": self.url_for("v1"),
        "api_key": self.DUMMY_API_KEY,
    }
    return openai.OpenAI(**client_kwargs)
# Return an ``openai.AsyncOpenAI`` client aimed at this server's "v1" URL,
# using the class's ``DUMMY_API_KEY`` as the API key.
# NOTE(review): the final `) )` line appears to be two diff columns fused
# together; only one of the parentheses closes the call below.
def get_async_client(self):
return openai.AsyncOpenAI(
base_url=self.url_for("v1"),
api_key=self.DUMMY_API_KEY,
) )
self._wait_for_server()
def ready(self):
    """Always answer ``True``; the method performs no checks itself."""
    answer = True
    return answer
def _wait_for_server(self):
    """Poll the local health endpoint until it answers with HTTP 200.

    Raises:
        RuntimeError: If ``self.proc`` has exited, or if more than
            ``self.MAX_SERVER_START_WAIT_S`` seconds elapse without a 200
            response.
    """
    # Upper bound for a single probe, so that a connection which is accepted
    # but never answered cannot block the loop indefinitely.
    probe_timeout_s = 5.0

    start = time.time()
    while True:
        failure = None
        try:
            response = requests.get("http://localhost:8000/health",
                                    timeout=probe_timeout_s)
            if response.status_code == 200:
                return
        except Exception as err:
            # The name bound by ``except`` is cleared when the clause ends,
            # so keep the error around for exception chaining below.
            failure = err

        # Reached after a failed request *and* after a non-200 reply. The
        # latter previously retried at once, with no sleep and no deadline.
        if self.proc.poll() is not None:
            raise RuntimeError("Server exited unexpectedly.") from failure

        time.sleep(0.5)
        if time.time() - start > self.MAX_SERVER_START_WAIT_S:
            raise RuntimeError("Server failed to start in time.") from failure
def __del__(self):
    """Stop the spawned process, if one was ever attached to this object."""
    has_proc = hasattr(self, "proc")
    if has_proc:
        self.proc.terminate()
def init_test_distributed_environment( def init_test_distributed_environment(
......
import pytest import pytest
import torch import torch
from vllm.distributed.parallel_state import init_distributed_environment from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
...@@ -292,6 +293,7 @@ def distributed_init(): ...@@ -292,6 +293,7 @@ def distributed_init():
rank=0, rank=0,
distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}",
local_rank=0) local_rank=0)
ensure_model_parallel_initialized(1, 1)
@pytest.mark.parametrize("batch_size", list(range(2, 128))) @pytest.mark.parametrize("batch_size", list(range(2, 128)))
......
...@@ -13,9 +13,10 @@ from vllm.pooling_params import PoolingParams ...@@ -13,9 +13,10 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.version import __dcu_version__ from vllm.version import __dcu_version__
__version__ = "0.5.0" from .version import __version__
__all__ = [ __all__ = [
"__version__",
"LLM", "LLM",
"ModelRegistry", "ModelRegistry",
"PromptStrictInputs", "PromptStrictInputs",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment