"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "5819ca8944af4f7dcbac3c6b73179f760e05910d"
Unverified Commit 43f10573, authored by Ajay Anubolu, committed by GitHub
Browse files

[Bugfix] Fix misleading context length error messages (#36197)


Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 86e1060b
...@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test): ...@@ -200,7 +200,7 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
batch_2 = [valid_msg, valid_msg] batch_2 = [valid_msg, valid_msg]
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
with pytest.raises(ValueError, match="context length is only"): with pytest.raises(ValueError, match="maximum context length is"):
llm.chat(batch_1, sampling_params=sampling_params) llm.chat(batch_1, sampling_params=sampling_params)
assert llm.llm_engine.get_num_unfinished_requests() == 0 assert llm.llm_engine.get_num_unfinished_requests() == 0
......
...@@ -271,7 +271,7 @@ class TestRenderPrompt: ...@@ -271,7 +271,7 @@ class TestRenderPrompt:
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="input characters and requested .* context length is only", match="maximum context length is",
): ):
renderer.tokenize_prompts( renderer.tokenize_prompts(
prompts, prompts,
...@@ -292,7 +292,7 @@ class TestRenderPrompt: ...@@ -292,7 +292,7 @@ class TestRenderPrompt:
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="input tokens and requested .* context length is only", match="maximum context length is",
): ):
renderer.tokenize_prompts( renderer.tokenize_prompts(
prompts, prompts,
...@@ -313,7 +313,7 @@ class TestRenderPrompt: ...@@ -313,7 +313,7 @@ class TestRenderPrompt:
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="input tokens and requested .* context length is only", match="maximum context length is",
): ):
renderer.tokenize_prompts( renderer.tokenize_prompts(
prompts, prompts,
......
...@@ -791,11 +791,15 @@ class OpenAIServing: ...@@ -791,11 +791,15 @@ class OpenAIServing:
if max_tokens is not None and token_num + max_tokens > max_model_len: if max_tokens is not None and token_num + max_tokens > max_model_len:
raise VLLMValidationError( raise VLLMValidationError(
"'max_tokens' or 'max_completion_tokens' is too large: " f"This model's maximum context length is "
f"{max_tokens}. This model's maximum context length is " f"{max_model_len} tokens. However, you requested "
f"{max_model_len} tokens and your request has " f"{max_tokens} output tokens and your prompt contains "
f"{token_num} input tokens ({max_tokens} > {max_model_len}" f"{token_num} input tokens, for a total of "
f" - {token_num}).", f"{token_num + max_tokens} tokens "
f"({token_num} + {max_tokens} = "
f"{token_num + max_tokens} > {max_model_len}). "
f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="max_tokens", parameter="max_tokens",
value=max_tokens, value=max_tokens,
) )
......
...@@ -253,13 +253,14 @@ class TokenizeParams: ...@@ -253,13 +253,14 @@ class TokenizeParams:
# To save resources, fail the request outright without even # To save resources, fail the request outright without even
# attempting tokenization # attempting tokenization
raise VLLMValidationError( raise VLLMValidationError(
f"You passed {len(text)} input characters " f"This model's maximum context length is "
f"and requested {self.max_output_tokens} output tokens. " f"{self.max_total_tokens} tokens. However, you requested "
f"However, the model's context length is only " f"{self.max_output_tokens} output tokens and your prompt "
f"{self.max_total_tokens} tokens, resulting in a maximum " f"contains {len(text)} characters (more than "
f"input length of {max_input_tokens} tokens " f"{max_input_chars} characters, which is the upper bound "
f"(at most {max_input_chars} characters). " f"for {max_input_tokens} input tokens). "
f"Please reduce the length of the input prompt.", f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="input_text", parameter="input_text",
value=len(text), value=len(text),
) )
...@@ -334,15 +335,22 @@ class TokenizeParams: ...@@ -334,15 +335,22 @@ class TokenizeParams:
return tokens return tokens
if len(tokens) > max_input_tokens: if len(tokens) > max_input_tokens:
token_count = len(tokens)
# The tokenizer may have truncated the prompt to
# max_input_tokens + 1 (see get_encode_kwargs), so the
# actual prompt length could be larger.
qualifier = "at least " if token_count == max_input_tokens + 1 else ""
total = token_count + self.max_output_tokens
raise VLLMValidationError( raise VLLMValidationError(
f"You passed {len(tokens)} input tokens " f"This model's maximum context length is "
f"and requested {self.max_output_tokens} output tokens. " f"{self.max_total_tokens} tokens. However, you requested "
f"However, the model's context length is only " f"{self.max_output_tokens} output tokens and your prompt "
f"{self.max_total_tokens} tokens, resulting in a maximum " f"contains {qualifier}{token_count} input tokens, "
f"input length of {max_input_tokens} tokens. " f"for a total of {qualifier}{total} tokens. "
f"Please reduce the length of the input prompt.", f"Please reduce the length of the input prompt or the "
f"number of requested output tokens.",
parameter="input_tokens", parameter="input_tokens",
value=len(tokens), value=token_count,
) )
return tokens return tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment