Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4bdf7ac5
Unverified
Commit
4bdf7ac5
authored
Oct 09, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 09, 2025
Browse files
[Bugfix] Fix SHM cache initialization (#26427)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
dc7976dd
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
54 additions
and
66 deletions
+54
-66
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+1
-4
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+0
-3
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+0
-3
vllm/entrypoints/openai/serving_transcription.py
vllm/entrypoints/openai/serving_transcription.py
+0
-5
vllm/entrypoints/openai/speech_to_text.py
vllm/entrypoints/openai/speech_to_text.py
+1
-4
vllm/inputs/data.py
vllm/inputs/data.py
+4
-4
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+7
-10
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+19
-16
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+20
-17
vllm/v1/engine/processor.py
vllm/v1/engine/processor.py
+2
-0
No files found.
vllm/entrypoints/openai/serving_responses.py
View file @
4bdf7ac5
...
...
@@ -49,7 +49,6 @@ from openai.types.responses.response_reasoning_item import (
from
openai_harmony
import
Message
as
OpenAIHarmonyMessage
from
vllm
import
envs
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
...
...
@@ -109,7 +108,6 @@ class OpenAIServingResponses(OpenAIServing):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -127,7 +125,6 @@ class OpenAIServingResponses(OpenAIServing):
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
...
...
@@ -176,7 +173,7 @@ class OpenAIServingResponses(OpenAIServing):
"the store."
)
self
.
use_harmony
=
model_config
.
hf_config
.
model_type
==
"gpt_oss"
self
.
use_harmony
=
self
.
model_config
.
hf_config
.
model_type
==
"gpt_oss"
if
self
.
use_harmony
:
logger
.
warning
(
"For gpt-oss, we ignore --enable-auto-tool-choice "
...
...
vllm/entrypoints/openai/serving_score.py
View file @
4bdf7ac5
...
...
@@ -7,7 +7,6 @@ from typing import Any, Optional, Union
from
fastapi
import
Request
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
...
...
@@ -47,7 +46,6 @@ class ServingScores(OpenAIServing):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -55,7 +53,6 @@ class ServingScores(OpenAIServing):
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
log_error_stack
=
log_error_stack
,
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
4bdf7ac5
...
...
@@ -6,7 +6,6 @@ from typing import Any, Final, Optional, Union
import
jinja2
from
fastapi
import
Request
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
ChatTemplateContentFormatOption
from
vllm.entrypoints.logger
import
RequestLogger
...
...
@@ -32,7 +31,6 @@ class OpenAIServingTokenization(OpenAIServing):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -43,7 +41,6 @@ class OpenAIServingTokenization(OpenAIServing):
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
log_error_stack
=
log_error_stack
,
...
...
vllm/entrypoints/openai/serving_transcription.py
View file @
4bdf7ac5
...
...
@@ -5,7 +5,6 @@ from typing import Optional, Union
from
fastapi
import
Request
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
...
...
@@ -34,7 +33,6 @@ class OpenAIServingTranscription(OpenAISpeechToText):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -43,7 +41,6 @@ class OpenAIServingTranscription(OpenAISpeechToText):
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
...
...
@@ -95,7 +92,6 @@ class OpenAIServingTranslation(OpenAISpeechToText):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -104,7 +100,6 @@ class OpenAIServingTranslation(OpenAISpeechToText):
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
...
...
vllm/entrypoints/openai/speech_to_text.py
View file @
4bdf7ac5
...
...
@@ -12,7 +12,6 @@ import numpy as np
from
fastapi
import
Request
import
vllm.envs
as
envs
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
...
...
@@ -53,7 +52,6 @@ class OpenAISpeechToText(OpenAIServing):
def
__init__
(
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
models
:
OpenAIServingModels
,
*
,
request_logger
:
Optional
[
RequestLogger
],
...
...
@@ -63,7 +61,6 @@ class OpenAISpeechToText(OpenAIServing):
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
...
...
@@ -74,7 +71,7 @@ class OpenAISpeechToText(OpenAIServing):
self
.
task_type
=
task_type
self
.
asr_config
=
self
.
model_cls
.
get_speech_to_text_config
(
model_config
,
task_type
self
.
model_config
,
task_type
)
self
.
max_audio_filesize_mb
=
envs
.
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
...
...
vllm/inputs/data.py
View file @
4bdf7ac5
...
...
@@ -20,13 +20,13 @@ class TextPrompt(TypedDict):
prompt
:
str
"""The input text to be tokenized before passing to the model."""
multi_modal_data
:
NotRequired
[
"MultiModalDataDict"
]
multi_modal_data
:
NotRequired
[
Optional
[
"MultiModalDataDict"
]
]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs
:
NotRequired
[
dict
[
str
,
Any
]]
mm_processor_kwargs
:
NotRequired
[
Optional
[
dict
[
str
,
Any
]]
]
"""
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
...
...
@@ -61,13 +61,13 @@ class TokensPrompt(TypedDict):
token_type_ids
:
NotRequired
[
list
[
int
]]
"""A list of token type IDs to pass to the cross encoder model."""
multi_modal_data
:
NotRequired
[
"MultiModalDataDict"
]
multi_modal_data
:
NotRequired
[
Optional
[
"MultiModalDataDict"
]
]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs
:
NotRequired
[
dict
[
str
,
Any
]]
mm_processor_kwargs
:
NotRequired
[
Optional
[
dict
[
str
,
Any
]]
]
"""
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
...
...
vllm/inputs/preprocess.py
View file @
4bdf7ac5
...
...
@@ -17,7 +17,7 @@ from vllm.multimodal.inputs import (
MultiModalUUIDDict
,
)
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
init_tokenizer_from_configs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils.jsontree
import
json_iter_leaves
from
.data
import
(
...
...
@@ -45,20 +45,17 @@ class InputPreprocessor:
def
__init__
(
self
,
model_config
:
ModelConfig
,
tokenizer
:
Optional
[
AnyTokenizer
],
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
mm_processor_cache
:
Optional
[
BaseMultiModalProcessorCache
]
=
None
,
)
->
None
:
super
().
__init__
()
self
.
model_config
=
model_config
self
.
tokenizer
=
tokenizer
self
.
mm_registry
=
mm_registry
self
.
mm_processor_cache
=
mm_processor_cache
if
model_config
.
skip_tokenizer_init
:
self
.
tokenizer
=
None
else
:
self
.
tokenizer
=
init_tokenizer_from_configs
(
model_config
)
def
get_tokenizer
(
self
)
->
AnyTokenizer
:
if
self
.
tokenizer
is
None
:
raise
ValueError
(
...
...
@@ -351,8 +348,8 @@ class InputPreprocessor:
if
self
.
model_config
.
is_multimodal_model
:
inputs
=
self
.
_process_multimodal
(
prompt_token_ids
,
parsed_content
.
get
(
"multi_modal_data"
,
{}
)
,
parsed_content
.
get
(
"mm_processor_kwargs"
),
parsed_content
.
get
(
"multi_modal_data"
)
or
{},
parsed_content
.
get
(
"mm_processor_kwargs"
)
or
{}
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
)
...
...
@@ -380,8 +377,8 @@ class InputPreprocessor:
if
self
.
model_config
.
is_multimodal_model
:
inputs
=
self
.
_process_multimodal
(
prompt_text
,
parsed_content
.
get
(
"multi_modal_data"
,
{}
)
,
parsed_content
.
get
(
"mm_processor_kwargs"
),
parsed_content
.
get
(
"multi_modal_data"
)
or
{},
parsed_content
.
get
(
"mm_processor_kwargs"
)
or
{}
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
)
...
...
vllm/v1/engine/async_llm.py
View file @
4bdf7ac5
...
...
@@ -12,23 +12,23 @@ import numpy as np
import
torch
import
vllm.envs
as
envs
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.envs
import
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
from
vllm.inputs
import
PromptType
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.plugins.io_processors
import
get_io_processor
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.tasks
import
SupportedTask
from
vllm.tracing
import
init_tracer
from
vllm.transformers_utils.config
import
maybe_register_config_serialize_by_value
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
init_tokenizer_from_configs
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
Device
,
as_list
,
cancel_task_threadsafe
,
cdiv
,
deprecate_kwargs
from
vllm.v1.engine
import
EngineCoreRequest
...
...
@@ -104,8 +104,16 @@ class AsyncLLM(EngineClient):
"logger list; enabling logging without default stat loggers"
)
# Processor (converts Inputs --> EngineCoreRequests).
self
.
processor
=
Processor
(
vllm_config
,
mm_registry
=
mm_registry
)
if
self
.
model_config
.
skip_tokenizer_init
:
tokenizer
=
None
else
:
tokenizer
=
init_tokenizer_from_configs
(
self
.
model_config
)
self
.
processor
=
Processor
(
self
.
vllm_config
,
tokenizer
)
self
.
io_processor
=
get_io_processor
(
self
.
vllm_config
,
self
.
model_config
.
io_processor_plugin
,
)
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self
.
output_processor
=
OutputProcessor
(
...
...
@@ -245,10 +253,6 @@ class AsyncLLM(EngineClient):
cancel_task_threadsafe
(
getattr
(
self
,
"output_handler"
,
None
))
@
property
def
tokenizer
(
self
)
->
Optional
[
AnyTokenizer
]:
return
self
.
processor
.
tokenizer
async
def
get_supported_tasks
(
self
)
->
tuple
[
SupportedTask
,
...]:
return
await
self
.
engine_core
.
get_supported_tasks_async
()
...
...
@@ -615,14 +619,13 @@ class AsyncLLM(EngineClient):
logger
.
info
(
"Request %s failed."
,
request_id
)
raise
EngineGenerateError
()
from
e
async
def
get_vllm_config
(
self
)
->
VllmConfig
:
return
self
.
vllm_config
async
def
get_model_config
(
self
)
->
ModelConfig
:
return
self
.
model_config
@
property
def
tokenizer
(
self
)
->
Optional
[
AnyTokenizer
]:
return
self
.
processor
.
tokenizer
async
def
get_input_preprocessor
(
self
)
->
InputPreprocessor
:
return
self
.
processor
.
input_preprocessor
@
tokenizer
.
setter
def
tokenizer
(
self
,
tokenizer
:
Optional
[
AnyTokenizer
])
->
None
:
self
.
processor
.
tokenizer
=
tokenizer
async
def
get_tokenizer
(
self
)
->
AnyTokenizer
:
if
self
.
tokenizer
is
None
:
...
...
vllm/v1/engine/llm_engine.py
View file @
4bdf7ac5
...
...
@@ -19,11 +19,12 @@ from vllm.logger import init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.plugins.io_processors
import
get_io_processor
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.tasks
import
SupportedTask
from
vllm.tracing
import
init_tracer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
init_tokenizer_from_configs
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
Device
from
vllm.v1.engine
import
EngineCoreRequest
...
...
@@ -95,8 +96,16 @@ class LLMEngine:
self
.
dp_group
=
None
self
.
should_execute_dummy_batch
=
False
# Processor (convert Inputs --> EngineCoreRequests)
self
.
processor
=
Processor
(
vllm_config
,
mm_registry
=
mm_registry
)
if
self
.
model_config
.
skip_tokenizer_init
:
tokenizer
=
None
else
:
tokenizer
=
init_tokenizer_from_configs
(
self
.
model_config
)
self
.
processor
=
Processor
(
self
.
vllm_config
,
tokenizer
)
self
.
io_processor
=
get_io_processor
(
self
.
vllm_config
,
self
.
model_config
.
io_processor_plugin
,
)
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
self
.
output_processor
=
OutputProcessor
(
...
...
@@ -204,14 +213,6 @@ class LLMEngine:
def
validate_outputs
(
cls
,
outputs
,
output_type
):
return
outputs
@
property
def
tokenizer
(
self
)
->
Optional
[
AnyTokenizer
]:
return
self
.
processor
.
tokenizer
@
tokenizer
.
setter
def
tokenizer
(
self
,
tokenizer
:
Optional
[
AnyTokenizer
])
->
None
:
self
.
processor
.
tokenizer
=
tokenizer
def
get_supported_tasks
(
self
)
->
tuple
[
SupportedTask
,
...]:
return
self
.
engine_core
.
get_supported_tasks
()
...
...
@@ -313,12 +314,6 @@ class LLMEngine:
return
processed_outputs
.
request_outputs
def
get_vllm_config
(
self
):
return
self
.
vllm_config
def
get_model_config
(
self
):
return
self
.
model_config
def
start_profile
(
self
):
self
.
engine_core
.
profile
(
True
)
...
...
@@ -345,6 +340,14 @@ class LLMEngine:
assert
self
.
log_stats
,
"Stat logging disabled"
return
get_metrics_snapshot
()
@
property
def
tokenizer
(
self
)
->
Optional
[
AnyTokenizer
]:
return
self
.
processor
.
tokenizer
@
tokenizer
.
setter
def
tokenizer
(
self
,
tokenizer
:
Optional
[
AnyTokenizer
])
->
None
:
self
.
processor
.
tokenizer
=
tokenizer
def
get_tokenizer
(
self
)
->
AnyTokenizer
:
if
self
.
tokenizer
is
None
:
raise
ValueError
(
...
...
vllm/v1/engine/processor.py
View file @
4bdf7ac5
...
...
@@ -37,6 +37,7 @@ class Processor:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
tokenizer
:
Optional
[
AnyTokenizer
],
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
)
->
None
:
self
.
vllm_config
=
vllm_config
...
...
@@ -52,6 +53,7 @@ class Processor:
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
tokenizer
,
mm_registry
,
mm_processor_cache
=
self
.
mm_processor_cache
,
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment