Unverified commit 1666e664, authored by Xihui Cang, committed by GitHub
Browse files

Add "/server_info" endpoint in api_server to retrieve the vllm_config.  (#16572)


Signed-off-by: Xihui Cang <xihuicang@gmail.com>
parent 1575c170
...@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient): ...@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient):
exception=asyncio.CancelledError, exception=asyncio.CancelledError,
verbose=self.log_requests) verbose=self.log_requests)
# Async accessor for the full vLLM config; it forwards to the wrapped
# engine's get_vllm_config(), which is called without `await`.
async def get_vllm_config(self) -> VllmConfig:
"""Get the vllm configuration of the vLLM engine."""
return self.engine.get_vllm_config()
async def get_model_config(self) -> ModelConfig: async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine.""" """Get the model configuration of the vLLM engine."""
return self.engine.get_model_config() return self.engine.get_model_config()
......
...@@ -914,6 +914,10 @@ class LLMEngine: ...@@ -914,6 +914,10 @@ class LLMEngine:
scheduler.abort_seq_group( scheduler.abort_seq_group(
request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
# Plain (non-async) accessor: hands back the engine's `vllm_config`
# attribute as-is, without copying it.
def get_vllm_config(self) -> VllmConfig:
"""Gets the vllm configuration."""
return self.vllm_config
def get_model_config(self) -> ModelConfig: def get_model_config(self) -> ModelConfig:
"""Gets the model configuration.""" """Gets the model configuration."""
return self.model_config return self.model_config
......
...@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient): ...@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
self._errored_with: Optional[BaseException] = None self._errored_with: Optional[BaseException] = None
# Get the configs. # Get the configs.
self.vllm_config = engine_config
self.model_config = engine_config.model_config self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config self.decoding_config = engine_config.decoding_config
...@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient): ...@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient):
async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
return await self.tokenizer.get_lora_tokenizer_async(lora_request) return await self.tokenizer.get_lora_tokenizer_async(lora_request)
# Get the vllm configuration held by this client. Returns the
# `self.vllm_config` attribute directly; nothing is awaited here.
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config
async def get_decoding_config(self) -> DecodingConfig: async def get_decoding_config(self) -> DecodingConfig:
return self.decoding_config return self.decoding_config
......
...@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod ...@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from typing import AsyncGenerator, List, Mapping, Optional from typing import AsyncGenerator, List, Mapping, Optional
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs from vllm.core.scheduler import SchedulerOutputs
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
...@@ -220,6 +220,11 @@ class EngineClient(ABC): ...@@ -220,6 +220,11 @@ class EngineClient(ABC):
""" """
... ...
# Abstract coroutine: the body is only `...`, so concrete engine clients
# are expected to supply the implementation.
@abstractmethod
async def get_vllm_config(self) -> VllmConfig:
"""Get the vllm configuration of the vLLM engine."""
...
@abstractmethod @abstractmethod
async def get_model_config(self) -> ModelConfig: async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine.""" """Get the model configuration of the vLLM engine."""
......
...@@ -30,7 +30,7 @@ from starlette.routing import Mount ...@@ -30,7 +30,7 @@ from starlette.routing import Mount
from typing_extensions import assert_never from typing_extensions import assert_never
import vllm.envs as envs import vllm.envs as envs
from vllm.config import ModelConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.client import MQLLMEngineClient
...@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI): ...@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
"/load", "/load",
"/ping", "/ping",
"/version", "/version",
"/server_info",
], ],
registry=registry, registry=registry,
).add().instrument(app).expose(app) ).add().instrument(app).expose(app)
...@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = { ...@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
if envs.VLLM_SERVER_DEV_MODE: if envs.VLLM_SERVER_DEV_MODE:
# GET /server_info: report the vllm_config stored on the application state.
# NOTE(review): this appears to sit under the `if envs.VLLM_SERVER_DEV_MODE:`
# guard on the preceding line (indentation is lost in this view) - confirm.
@router.get("/server_info")
async def show_server_info(raw_request: Request):
# The config is passed through str(), so the JSON body carries its string
# form under the "vllm_config" key rather than a structured object.
server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
return JSONResponse(content=server_info)
@router.post("/reset_prefix_cache") @router.post("/reset_prefix_cache")
async def reset_prefix_cache(raw_request: Request): async def reset_prefix_cache(raw_request: Request):
""" """
...@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI: ...@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI:
async def init_app_state( async def init_app_state(
engine_client: EngineClient, engine_client: EngineClient,
model_config: ModelConfig, vllm_config: VllmConfig,
state: State, state: State,
args: Namespace, args: Namespace,
) -> None: ) -> None:
...@@ -915,6 +921,8 @@ async def init_app_state( ...@@ -915,6 +921,8 @@ async def init_app_state(
state.engine_client = engine_client state.engine_client = engine_client
state.log_stats = not args.disable_log_stats state.log_stats = not args.disable_log_stats
state.vllm_config = vllm_config
model_config = vllm_config.model_config
resolved_chat_template = load_chat_template(args.chat_template) resolved_chat_template = load_chat_template(args.chat_template)
if resolved_chat_template is not None: if resolved_chat_template is not None:
...@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: ...@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
async with build_async_engine_client(args) as engine_client: async with build_async_engine_client(args) as engine_client:
app = build_app(args) app = build_app(args)
model_config = await engine_client.get_model_config() vllm_config = await engine_client.get_vllm_config()
await init_app_state(engine_client, model_config, app.state, args) await init_app_state(engine_client, vllm_config, app.state, args)
def _listen_addr(a: str) -> str: def _listen_addr(a: str) -> str:
if is_valid_ipv6_address(a): if is_valid_ipv6_address(a):
......
...@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient): ...@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient):
assert start_engine_loop assert start_engine_loop
self.model_config = vllm_config.model_config self.model_config = vllm_config.model_config
self.vllm_config = vllm_config
self.log_requests = log_requests self.log_requests = log_requests
self.log_stats = log_stats self.log_stats = log_stats
...@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient): ...@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient):
): ):
raise ValueError("Not Supported on V1 yet.") raise ValueError("Not Supported on V1 yet.")
# Get the vllm configuration held by this client. Returns the
# `self.vllm_config` attribute directly; nothing is awaited here.
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config
async def get_model_config(self) -> ModelConfig: async def get_model_config(self) -> ModelConfig:
return self.model_config return self.model_config
......
...@@ -230,6 +230,9 @@ class LLMEngine: ...@@ -230,6 +230,9 @@ class LLMEngine:
return processed_outputs.request_outputs return processed_outputs.request_outputs
# Plain (non-async) accessor: returns the engine's `vllm_config` attribute
# as-is. Left unannotated, like the get_model_config accessor next to it.
def get_vllm_config(self):
return self.vllm_config
def get_model_config(self): def get_model_config(self):
return self.model_config return self.model_config
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment