Unverified Commit 5bf35a91 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Doc][CI/Build] Update docs and tests to use `vllm serve` (#6431)

parent a19e8d37
......@@ -23,17 +23,17 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
]) as remote_server:
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
......
......@@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
## Start OpenAI API server
openai_args = [
"--model", model_ref, "--dtype", "float16", "--load-format",
"--dtype", "float16", "--load-format",
"tensorizer", "--model-loader-extra-config",
json.dumps(model_loader_extra_config),
]
with RemoteOpenAIServer(openai_args) as server:
with RemoteOpenAIServer(model_ref, openai_args) as server:
print("Server ready.")
client = server.get_client()
......
......@@ -49,7 +49,13 @@ class RemoteOpenAIServer:
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S = 600 # wait up to 600 seconds for the server to start
def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
def __init__(
self,
model: str,
cli_args: List[str],
*,
auto_port: bool = True,
) -> None:
if auto_port:
if "-p" in cli_args or "--port" in cli_args:
raise ValueError("You have manually specified the port"
......@@ -68,12 +74,10 @@ class RemoteOpenAIServer:
# the current process might initialize cuda,
# to be safe, we should use spawn method
env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
self.proc = subprocess.Popen(
[sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
cli_args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr)
self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr)
self._wait_for_server(url=self.url_for("health"),
timeout=self.MAX_SERVER_START_WAIT_S)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment