Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1666e664
Unverified
Commit
1666e664
authored
Apr 15, 2025
by
Xihui Cang
Committed by
GitHub
Apr 15, 2025
Browse files
Add "/server_info" endpoint in api_server to retrieve the vllm_config. (#16572)
Signed-off-by:
Xihui Cang
<
xihuicang@gmail.com
>
parent
1575c170
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
37 additions
and
6 deletions
+37
-6
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+4
-0
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+4
-0
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/client.py
+4
-0
vllm/engine/protocol.py
vllm/engine/protocol.py
+6
-1
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+12
-4
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+4
-1
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+3
-0
No files found.
vllm/engine/async_llm_engine.py
View file @
1666e664
...
@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient):
...
@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient):
exception
=
asyncio
.
CancelledError
,
exception
=
asyncio
.
CancelledError
,
verbose
=
self
.
log_requests
)
verbose
=
self
.
log_requests
)
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Get the vllm configuration of the vLLM engine."""
return
self
.
engine
.
get_vllm_config
()
async
def
get_model_config
(
self
)
->
ModelConfig
:
async
def
get_model_config
(
self
)
->
ModelConfig
:
"""Get the model configuration of the vLLM engine."""
"""Get the model configuration of the vLLM engine."""
return
self
.
engine
.
get_model_config
()
return
self
.
engine
.
get_model_config
()
...
...
vllm/engine/llm_engine.py
View file @
1666e664
...
@@ -914,6 +914,10 @@ class LLMEngine:
...
@@ -914,6 +914,10 @@ class LLMEngine:
scheduler
.
abort_seq_group
(
scheduler
.
abort_seq_group
(
request_id
,
seq_id_to_seq_group
=
self
.
seq_id_to_seq_group
)
request_id
,
seq_id_to_seq_group
=
self
.
seq_id_to_seq_group
)
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Gets the vllm configuration."""
return
self
.
vllm_config
def
get_model_config
(
self
)
->
ModelConfig
:
def
get_model_config
(
self
)
->
ModelConfig
:
"""Gets the model configuration."""
"""Gets the model configuration."""
return
self
.
model_config
return
self
.
model_config
...
...
vllm/engine/multiprocessing/client.py
View file @
1666e664
...
@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
...
@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
self
.
_errored_with
:
Optional
[
BaseException
]
=
None
self
.
_errored_with
:
Optional
[
BaseException
]
=
None
# Get the configs.
# Get the configs.
self
.
vllm_config
=
engine_config
self
.
model_config
=
engine_config
.
model_config
self
.
model_config
=
engine_config
.
model_config
self
.
decoding_config
=
engine_config
.
decoding_config
self
.
decoding_config
=
engine_config
.
decoding_config
...
@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient):
...
@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient):
async
def
get_tokenizer
(
self
,
lora_request
:
Optional
[
LoRARequest
]
=
None
):
async
def
get_tokenizer
(
self
,
lora_request
:
Optional
[
LoRARequest
]
=
None
):
return
await
self
.
tokenizer
.
get_lora_tokenizer_async
(
lora_request
)
return
await
self
.
tokenizer
.
get_lora_tokenizer_async
(
lora_request
)
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
return
self
.
vllm_config
async
def
get_decoding_config
(
self
)
->
DecodingConfig
:
async
def
get_decoding_config
(
self
)
->
DecodingConfig
:
return
self
.
decoding_config
return
self
.
decoding_config
...
...
vllm/engine/protocol.py
View file @
1666e664
...
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
...
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from
typing
import
AsyncGenerator
,
List
,
Mapping
,
Optional
from
typing
import
AsyncGenerator
,
List
,
Mapping
,
Optional
from
vllm.beam_search
import
BeamSearchSequence
,
create_sort_beams_key_function
from
vllm.beam_search
import
BeamSearchSequence
,
create_sort_beams_key_function
from
vllm.config
import
DecodingConfig
,
ModelConfig
from
vllm.config
import
DecodingConfig
,
ModelConfig
,
VllmConfig
from
vllm.core.scheduler
import
SchedulerOutputs
from
vllm.core.scheduler
import
SchedulerOutputs
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.parse
import
is_explicit_encoder_decoder_prompt
from
vllm.inputs.parse
import
is_explicit_encoder_decoder_prompt
...
@@ -220,6 +220,11 @@ class EngineClient(ABC):
...
@@ -220,6 +220,11 @@ class EngineClient(ABC):
"""
"""
...
...
@
abstractmethod
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
"""Get the vllm configuration of the vLLM engine."""
...
@
abstractmethod
@
abstractmethod
async
def
get_model_config
(
self
)
->
ModelConfig
:
async
def
get_model_config
(
self
)
->
ModelConfig
:
"""Get the model configuration of the vLLM engine."""
"""Get the model configuration of the vLLM engine."""
...
...
vllm/entrypoints/openai/api_server.py
View file @
1666e664
...
@@ -30,7 +30,7 @@ from starlette.routing import Mount
...
@@ -30,7 +30,7 @@ from starlette.routing import Mount
from
typing_extensions
import
assert_never
from
typing_extensions
import
assert_never
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.config
import
Model
Config
from
vllm.config
import
Vllm
Config
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
# type: ignore
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
# type: ignore
from
vllm.engine.multiprocessing.client
import
MQLLMEngineClient
from
vllm.engine.multiprocessing.client
import
MQLLMEngineClient
...
@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
...
@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
"/load"
,
"/load"
,
"/ping"
,
"/ping"
,
"/version"
,
"/version"
,
"/server_info"
,
],
],
registry
=
registry
,
registry
=
registry
,
).
add
().
instrument
(
app
).
expose
(
app
)
).
add
().
instrument
(
app
).
expose
(
app
)
...
@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
...
@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
if
envs
.
VLLM_SERVER_DEV_MODE
:
if
envs
.
VLLM_SERVER_DEV_MODE
:
@
router
.
get
(
"/server_info"
)
async
def
show_server_info
(
raw_request
:
Request
):
server_info
=
{
"vllm_config"
:
str
(
raw_request
.
app
.
state
.
vllm_config
)}
return
JSONResponse
(
content
=
server_info
)
@
router
.
post
(
"/reset_prefix_cache"
)
@
router
.
post
(
"/reset_prefix_cache"
)
async
def
reset_prefix_cache
(
raw_request
:
Request
):
async
def
reset_prefix_cache
(
raw_request
:
Request
):
"""
"""
...
@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI:
...
@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI:
async
def
init_app_state
(
async
def
init_app_state
(
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
model
_config
:
Model
Config
,
vllm
_config
:
Vllm
Config
,
state
:
State
,
state
:
State
,
args
:
Namespace
,
args
:
Namespace
,
)
->
None
:
)
->
None
:
...
@@ -915,6 +921,8 @@ async def init_app_state(
...
@@ -915,6 +921,8 @@ async def init_app_state(
state
.
engine_client
=
engine_client
state
.
engine_client
=
engine_client
state
.
log_stats
=
not
args
.
disable_log_stats
state
.
log_stats
=
not
args
.
disable_log_stats
state
.
vllm_config
=
vllm_config
model_config
=
vllm_config
.
model_config
resolved_chat_template
=
load_chat_template
(
args
.
chat_template
)
resolved_chat_template
=
load_chat_template
(
args
.
chat_template
)
if
resolved_chat_template
is
not
None
:
if
resolved_chat_template
is
not
None
:
...
@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
...
@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
async
with
build_async_engine_client
(
args
)
as
engine_client
:
async
with
build_async_engine_client
(
args
)
as
engine_client
:
app
=
build_app
(
args
)
app
=
build_app
(
args
)
model
_config
=
await
engine_client
.
get_
model
_config
()
vllm
_config
=
await
engine_client
.
get_
vllm
_config
()
await
init_app_state
(
engine_client
,
model
_config
,
app
.
state
,
args
)
await
init_app_state
(
engine_client
,
vllm
_config
,
app
.
state
,
args
)
def
_listen_addr
(
a
:
str
)
->
str
:
def
_listen_addr
(
a
:
str
)
->
str
:
if
is_valid_ipv6_address
(
a
):
if
is_valid_ipv6_address
(
a
):
...
...
vllm/v1/engine/async_llm.py
View file @
1666e664
...
@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient):
...
@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient):
assert
start_engine_loop
assert
start_engine_loop
self
.
model_config
=
vllm_config
.
model_config
self
.
model_config
=
vllm_config
.
model_config
self
.
vllm_config
=
vllm_config
self
.
log_requests
=
log_requests
self
.
log_requests
=
log_requests
self
.
log_stats
=
log_stats
self
.
log_stats
=
log_stats
...
@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient):
...
@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient):
):
):
raise
ValueError
(
"Not Supported on V1 yet."
)
raise
ValueError
(
"Not Supported on V1 yet."
)
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
return
self
.
vllm_config
async
def
get_model_config
(
self
)
->
ModelConfig
:
async
def
get_model_config
(
self
)
->
ModelConfig
:
return
self
.
model_config
return
self
.
model_config
...
...
vllm/v1/engine/llm_engine.py
View file @
1666e664
...
@@ -230,6 +230,9 @@ class LLMEngine:
...
@@ -230,6 +230,9 @@ class LLMEngine:
return
processed_outputs
.
request_outputs
return
processed_outputs
.
request_outputs
def
get_vllm_config
(
self
):
return
self
.
vllm_config
def
get_model_config
(
self
):
def
get_model_config
(
self
):
return
self
.
model_config
return
self
.
model_config
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment