perf: Create default sampling params only once during initialization (#1294)

92e33b86 · Kris Hung · GitHub · 8df6e882 · 92e33b86 · 92e33b86
Unverified commit 92e33b86, authored May 30, 2025 by Kris Hung; committed by GitHub on May 30, 2025.
Showing 2 changed files with 6 additions and 2 deletions

examples/multimodal/components/processor.py +1 -0

examples/multimodal/utils/chat_processor.py +5 -2

No files found.
--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -59,6 +59,7 @@ class Processor(ProcessMixIn):
        class_name = self.__class__.__name__
        self.engine_args = parse_vllm_args(class_name, "")
        self.model_config = self.engine_args.create_model_config()
+        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.tokenizer = self._create_tokenizer(self.engine_args)
        self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
        self.completions_processor = CompletionsProcessor(

--- a/examples/multimodal/utils/chat_processor.py
+++ b/examples/multimodal/utils/chat_processor.py
@@ -29,6 +29,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_engine import RequestPrompt
 from vllm.inputs.data import TokensPrompt
+from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -38,6 +39,7 @@ class ProcessMixInRequired(Protocol):
    chat_processor: "ChatProcessor | None"
    completions_processor: "CompletionsProcessor | None"
    model_config: ModelConfig
+    default_sampling_params: SamplingParams
 class ProcessMixIn(ProcessMixInRequired):
@@ -50,6 +52,7 @@ class ProcessMixIn(ProcessMixInRequired):
    chat_processor: "ChatProcessor | None"
    completions_processor: "CompletionsProcessor | None"
    model_config: ModelConfig
+    default_sampling_params: SamplingParams
    def __init__(self):
        pass
@@ -76,11 +79,11 @@ class ProcessMixIn(ProcessMixInRequired):
        default_max_tokens = self.model_config.max_model_len - len(
            preprocess_result.engine_prompt["prompt_token_ids"]
        )
-        default_sampling_params = self.model_config.get_diff_sampling_param()
        sampling_params = request.to_sampling_params(
            default_max_tokens,
            self.model_config.logits_processor_pattern,
-            default_sampling_params,
+            self.default_sampling_params,
        )
        return (
            request,