Unverified commit 1c2bc7ea, authored by Gabriel Marinho, committed by GitHub
Browse files

Truncation control for embedding models (#14776)


Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
parent 4055130a
......@@ -2,7 +2,7 @@
import time
from collections.abc import Mapping, Sequence
from typing import Literal, Optional, Union
from typing import Any, Literal, Optional, Union
from vllm.config import VllmConfig
from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
......@@ -198,6 +198,7 @@ class Processor:
params: Union[SamplingParams, PoolingParams],
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
......@@ -224,6 +225,7 @@ class Processor:
# 3. Apply prompt adapter to prompt token ids if one exists.
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=self.use_hash,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment