Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e77f162c
Unverified
Commit
e77f162c
authored
Jan 31, 2026
by
Nicolò Lucchesi
Committed by
GitHub
Jan 31, 2026
Browse files
[Bugfix] Fix `Qwen3ASR` language asr tag in output (#33410)
Signed-off-by:
NickLucche
<
nlucches@redhat.com
>
parent
8ecd213c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
42 additions
and
2 deletions
+42
-2
vllm/entrypoints/openai/translations/speech_to_text.py
vllm/entrypoints/openai/translations/speech_to_text.py
+6
-1
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces.py
+16
-0
vllm/model_executor/models/qwen3_asr.py
vllm/model_executor/models/qwen3_asr.py
+20
-1
No files found.
vllm/entrypoints/openai/translations/speech_to_text.py
View file @
e77f162c
...
...
@@ -518,7 +518,8 @@ class OpenAISpeechToText(OpenAIServing):
total_segments
.
extend
(
segments
)
text_parts
.
extend
([
seg
.
text
for
seg
in
segments
])
else
:
text_parts
.
append
(
op
.
outputs
[
0
].
text
)
raw_text
=
op
.
outputs
[
0
].
text
text_parts
.
append
(
self
.
model_cls
.
post_process_output
(
raw_text
))
text
=
""
.
join
(
text_parts
)
if
self
.
task_type
==
"transcribe"
:
final_response
:
ResponseType
...
...
@@ -607,6 +608,10 @@ class OpenAISpeechToText(OpenAIServing):
assert
len
(
res
.
outputs
)
==
1
output
=
res
.
outputs
[
0
]
# TODO: For models that output structured formats (e.g.,
# Qwen3-ASR with "language X<asr_text>" prefix), streaming
# would need buffering to strip the prefix properly since
# deltas may split the tag across chunks.
delta_message
=
DeltaMessage
(
content
=
output
.
text
)
completion_tokens
+=
len
(
output
.
token_ids
)
...
...
vllm/model_executor/models/interfaces.py
View file @
e77f162c
...
...
@@ -1145,6 +1145,22 @@ class SupportsTranscription(Protocol):
"""
return
None
@classmethod
def post_process_output(cls, text: str) -> str:
    """Hook for cleaning up raw decoded text before it reaches the user.

    ASR models may emit structured output (for example language tags or
    special tokens) around the actual transcription. Implementations can
    override this hook to strip such markup. This default implementation
    is a pass-through.

    Args:
        text: Raw decoded text from the model.

    Returns:
        The transcription text; here, ``text`` exactly as received.
    """
    # Default: nothing to strip, hand the text back untouched.
    return text
@
overload
def
supports_transcription
(
...
...
vllm/model_executor/models/qwen3_asr.py
View file @
e77f162c
...
...
@@ -90,6 +90,7 @@ from vllm.transformers_utils.processors.qwen3_asr import (
)
logger
=
init_logger
(
__name__
)
_ASR_TEXT_TAG
=
"<asr_text>"
def
_get_feat_extract_output_lengths
(
input_lengths
:
torch
.
Tensor
):
...
...
@@ -556,7 +557,7 @@ class Qwen3ASRForConditionalGeneration(
else
:
prompt
=
(
f
"<|im_start|>user
\n
{
audio_placeholder
}
<|im_end|>
\n
"
f
"<|im_start|>assistant
\n
language
{
full_lang_name_to
}
<asr_text>
"
f
"<|im_start|>assistant
\n
language
{
full_lang_name_to
}
{
_ASR_TEXT_TAG
}
"
)
prompt_token_ids
=
tokenizer
.
encode
(
prompt
)
...
...
@@ -565,3 +566,21 @@ class Qwen3ASRForConditionalGeneration(
"multi_modal_data"
:
{
"audio"
:
audio
},
}
return
cast
(
PromptType
,
prompt_dict
)
@classmethod
def post_process_output(cls, text: str) -> str:
    """Extract the clean transcription from raw Qwen3-ASR output.

    The raw output is expected to look like
    ``"language {lang}<asr_text>{transcription}"``. Everything up to and
    including the last ``<asr_text>`` tag is dropped.

    Args:
        text: Raw decoded text from the model.

    Returns:
        An empty string for falsy input; otherwise the text following the
        last ``<asr_text>`` tag, or ``text`` unchanged if the tag is absent.
    """
    if not text:
        return ""

    # rpartition splits at the LAST occurrence of the tag. When the tag is
    # missing it yields ("", "", text), so the final element is the right
    # answer in both the tagged and untagged cases.
    _prefix, _tag, transcription = text.rpartition(_ASR_TEXT_TAG)
    return transcription
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment