"""Model context length (prompt and output). If unspecified, will be
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
automatically derived from the model config.
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
format. Examples:\n
format. Examples:\n
- 1k -> 1000\n
- 1k -> 1000\n
...
@@ -518,11 +518,11 @@ class ModelConfig:
...
@@ -518,11 +518,11 @@ class ModelConfig:
self.hf_text_config.sliding_window)
self.hf_text_config.sliding_window)
logger.warning_once(
logger.warning_once(
f"{self.hf_text_config.model_type} has interleaved "
"%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).",# noqa: E501
"attention, which is currently not supported by the "
self.hf_text_config.model_type,
f"{backend} backend. Disabling sliding window and capping "
"Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.",# noqa: E501
f"which means it won't appear in the op registry. "
cls.__name__,
f"It will be enabled/disabled based on the global settings.")
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "# noqa: E501
" is too short "
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "# noqa: E501
"to hold the multi-modal embeddings in the worst case "
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",# noqa: E501
f"({total_len} tokens in total, out of which "
seq_len,
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
total_len,
"multi-modal embeddings). This may cause certain "
str(self._get_mm_num_tokens(mm_inputs)),
"multi-modal inputs to fail during inference, even when "
)
"the input text is short. To avoid this, you should "
"is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "# noqa: E501
"is too short "
"This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "# noqa: E501
"to hold the multi-modal embeddings in the worst case "
"To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",# noqa: E501
f"({total_len} tokens in total, out of which "
seq_len,
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
total_len,
"multi-modal embeddings). This may cause certain "
str(self._get_mm_num_tokens(mm_inputs)),
"multi-modal inputs to fail during inference, even when "
)
"the input text is short. To avoid this, you should "