Unverified commit 43f10573, authored by Ajay Anubolu and committed by GitHub
Browse files

[Bugfix] Fix misleading context length error messages (#36197)


Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 86e1060b
......@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
batch_2 = [valid_msg, valid_msg]
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with pytest.raises(ValueError, match="context length is only"):
with pytest.raises(ValueError, match="maximum context length is"):
llm.chat(batch_1, sampling_params=sampling_params)
assert llm.llm_engine.get_num_unfinished_requests() == 0
......
......@@ -271,7 +271,7 @@ class TestRenderPrompt:
with pytest.raises(
ValueError,
match="input characters and requested .* context length is only",
match="maximum context length is",
):
renderer.tokenize_prompts(
prompts,
......@@ -292,7 +292,7 @@ class TestRenderPrompt:
with pytest.raises(
ValueError,
match="input tokens and requested .* context length is only",
match="maximum context length is",
):
renderer.tokenize_prompts(
prompts,
......@@ -313,7 +313,7 @@ class TestRenderPrompt:
with pytest.raises(
ValueError,
match="input tokens and requested .* context length is only",
match="maximum context length is",
):
renderer.tokenize_prompts(
prompts,
......
......@@ -791,11 +791,15 @@ class OpenAIServing:
if max_tokens is not None and token_num + max_tokens > max_model_len:
raise VLLMValidationError(
"'max_tokens' or 'max_completion_tokens' is too large: "
f"{max_tokens}. This model's maximum context length is "
f"{max_model_len} tokens and your request has "
f"{token_num} input tokens ({max_tokens} > {max_model_len}"
f" - {token_num}).",
f"This model's maximum context length is "
f"{max_model_len} tokens. However, you requested "
f"{max_tokens} output tokens and your prompt contains "
f"{token_num} input tokens, for a total of "
f"{token_num + max_tokens} tokens "
f"({token_num} + {max_tokens} = "
f"{token_num + max_tokens} > {max_model_len}). "
f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="max_tokens",
value=max_tokens,
)
......
......@@ -253,13 +253,14 @@ class TokenizeParams:
# To save resources, fail the request outright without even
# attempting tokenization
raise VLLMValidationError(
f"You passed {len(text)} input characters "
f"and requested {self.max_output_tokens} output tokens. "
f"However, the model's context length is only "
f"{self.max_total_tokens} tokens, resulting in a maximum "
f"input length of {max_input_tokens} tokens "
f"(at most {max_input_chars} characters). "
f"Please reduce the length of the input prompt.",
f"This model's maximum context length is "
f"{self.max_total_tokens} tokens. However, you requested "
f"{self.max_output_tokens} output tokens and your prompt "
f"contains {len(text)} characters (more than "
f"{max_input_chars} characters, which is the upper bound "
f"for {max_input_tokens} input tokens). "
f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="input_text",
value=len(text),
)
......@@ -334,15 +335,22 @@ class TokenizeParams:
return tokens
if len(tokens) > max_input_tokens:
token_count = len(tokens)
# The tokenizer may have truncated the prompt to
# max_input_tokens + 1 (see get_encode_kwargs), so the
# actual prompt length could be larger.
qualifier = "at least " if token_count == max_input_tokens + 1 else ""
total = token_count + self.max_output_tokens
raise VLLMValidationError(
f"You passed {len(tokens)} input tokens "
f"and requested {self.max_output_tokens} output tokens. "
f"However, the model's context length is only "
f"{self.max_total_tokens} tokens, resulting in a maximum "
f"input length of {max_input_tokens} tokens. "
f"Please reduce the length of the input prompt.",
f"This model's maximum context length is "
f"{self.max_total_tokens} tokens. However, you requested "
f"{self.max_output_tokens} output tokens and your prompt "
f"contains {qualifier}{token_count} input tokens, "
f"for a total of {qualifier}{total} tokens. "
f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="input_tokens",
value=len(tokens),
value=token_count,
)
return tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.