"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
format. Examples:\n
- 1k -> 1000\n
...
...
@@ -518,11 +518,11 @@ class ModelConfig:
self.hf_text_config.sliding_window)
logger.warning_once(
f"{self.hf_text_config.model_type} has interleaved "
"attention, which is currently not supported by the "
f"{backend} backend. Disabling sliding window and capping "
"the max length to the slidingwindow size "
f"({sliding_window_len_min}).")
"%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).",# noqa: E501
f"which means it won't appear in the op registry. "
f"It will be enabled/disabled based on the global settings.")
"Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.",# noqa: E501
"The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "# noqa: E501
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "# noqa: E501
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "# noqa: E501
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",# noqa: E501
seq_len,
total_len,
str(self._get_mm_num_tokens(mm_inputs)),
)
returnDummyEncoderData(encoder_prompt_token_ids)
...
...
@@ -243,17 +240,14 @@ class MultiModalProfiler(Generic[_I]):
iftotal_len>seq_lenandnotenvs.VLLM_USE_V1:
# `max_num_batched_tokens` is defined by `SchedulerConfig`
"The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "# noqa: E501
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "# noqa: E501
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "# noqa: E501
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",# noqa: E501