Unverified commit 15e0bb9c, authored by Patrick von Platen and committed by GitHub
Browse files

[Streaming -> Realtime] Rename all voxtral related classes, fn, files (#33415)


Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent 6c64c41b
...@@ -610,54 +610,10 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan ...@@ -610,54 +610,10 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
| `transcription.done` | Final transcription with usage stats | | `transcription.done` | Final transcription with usage stats |
| `error` | Error notification with message and optional code | | `error` | Error notification with message and optional code |
#### Python WebSocket Example #### Example Clients
??? code - [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
```python
import asyncio
import base64
import json
import websockets
async def realtime_transcribe():
uri = "ws://localhost:8000/v1/realtime"
async with websockets.connect(uri) as ws:
# Wait for session.created
response = await ws.recv()
print(f"Session: {response}")
# Commit buffer
await ws.send(json.dumps({
"type": "input_audio_buffer.commit"
}))
# Send audio chunks (example with file)
with open("audio.raw", "rb") as f:
while chunk := f.read(4096):
await ws.send(json.dumps({
"type": "input_audio_buffer.append",
"audio": base64.b64encode(chunk).decode()
}))
# Signal all audio is sent
await ws.send(json.dumps({
"type": "input_audio_buffer.commit",
"final": True,
}))
# Receive transcription
while True:
response = json.loads(await ws.recv())
if response["type"] == "transcription.delta":
print(response["delta"], end="", flush=True)
elif response["type"] == "transcription.done":
print(f"\nFinal: {response['text']}")
break
asyncio.run(realtime_transcribe())
```
### Tokenizer API ### Tokenizer API
......
...@@ -74,7 +74,7 @@ def async_engine() -> AsyncLLM: ...@@ -74,7 +74,7 @@ def async_engine() -> AsyncLLM:
@pytest.mark.skip(reason="Voxtral streaming is not yet public") @pytest.mark.skip(reason="Voxtral streaming is not yet public")
def test_voxtral_streaming_forward(audio_assets, tokenizer, engine): def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
audio_config = tokenizer.instruct_tokenizer.tokenizer.audio audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
def from_file(file_path: str): def from_file(file_path: str):
...@@ -219,7 +219,7 @@ class RealTimeAudioInput: ...@@ -219,7 +219,7 @@ class RealTimeAudioInput:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.skip(reason="Voxtral streaming is not yet public") @pytest.mark.skip(reason="Voxtral streaming is not yet public")
async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine): async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
sampling_params = SamplingParams(temperature=0.0, max_tokens=1) sampling_params = SamplingParams(temperature=0.0, max_tokens=1)
output_tokens_list = [] output_tokens_list = []
......
...@@ -989,7 +989,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -989,7 +989,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# disable this temporarily until we support HF format # disable this temporarily until we support HF format
is_available_online=False, is_available_online=False,
), ),
"VoxtralStreamingGeneration": _HfExamplesInfo( "VoxtralRealtimeGeneration": _HfExamplesInfo(
"<place-holder>", "<place-holder>",
# disable this temporarily until we support HF format # disable this temporarily until we support HF format
is_available_online=False, is_available_online=False,
......
...@@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = { ...@@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = {
), ),
"UltravoxModel": ("ultravox", "UltravoxModel"), "UltravoxModel": ("ultravox", "UltravoxModel"),
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), # noqa: E501
# [Encoder-decoder] # [Encoder-decoder]
"NemotronParseForConditionalGeneration": ( "NemotronParseForConditionalGeneration": (
"nemotron_parse", "nemotron_parse",
......
...@@ -50,7 +50,7 @@ logger = init_logger(__name__) ...@@ -50,7 +50,7 @@ logger = init_logger(__name__)
_PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30 _PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30
class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor):
def __init__( def __init__(
self, self,
info: _I, info: _I,
...@@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): ...@@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
*, *,
cache: BaseMultiModalProcessorCache | None = None, cache: BaseMultiModalProcessorCache | None = None,
) -> None: ) -> None:
# streaming can't make use of a cache yet # realtime can't make use of a cache yet
super().__init__(info, dummy_inputs, cache=None) super().__init__(info, dummy_inputs, cache=None)
def _maybe_apply_prompt_updates( def _maybe_apply_prompt_updates(
...@@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): ...@@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
# there are no placeholder audio tokens for streaming # there are no placeholder audio tokens for streaming
# so we need to build the place placeholder positions manually # so we need to build the place placeholder positions manually
# in streaming there is always only one audio input # in realtime there is always only one audio input
audios = mm_kwargs.get("audio", []) audios = mm_kwargs.get("audio", [])
assert len(audios) == 1, ( assert len(audios) == 1, (
f"Expected only one audio input for streaming, got {mm_kwargs=}" f"Expected only one audio input for realtime, got {mm_kwargs=}"
) )
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
audio_config = tokenizer.instruct.audio_encoder.audio_config audio_config = tokenizer.instruct.audio_encoder.audio_config
...@@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer: ...@@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer:
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
VoxtralStreamingMultiModalProcessor, VoxtralRealtimeMultiModalProcessor,
info=VoxtralProcessingInfo, info=VoxtralProcessingInfo,
dummy_inputs=VoxtralDummyInputsBuilder, dummy_inputs=VoxtralDummyInputsBuilder,
) )
@support_torch_compile @support_torch_compile
class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealtime): class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
requires_raw_input_tokens = True requires_raw_input_tokens = True
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
...@@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti ...@@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
assert ( assert (
not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs() not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
), ( ), "Voxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE."
"Voxtral streaming doesn't support full cudagraphs yet. "
"Please use PIECEWISE."
)
self.time_embedding: TimeEmbedding = TimeEmbedding( self.time_embedding: TimeEmbedding = TimeEmbedding(
dim=self.config.text_config.hidden_size dim=self.config.text_config.hidden_size
...@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti ...@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
handle_oov_mm_token: bool = True, handle_oov_mm_token: bool = True,
) -> torch.Tensor: ) -> torch.Tensor:
"""Pass post-conv embeddings directly as input""" """Pass post-conv embeddings directly as input"""
# for streaming we simply flatten the multimodal embeddings # for realtime we simply flatten the multimodal embeddings
# to be in tensor format, we treat the input ids later # to be in tensor format, we treat the input ids later
assert multimodal_embeddings is not None assert multimodal_embeddings is not None
assert len(multimodal_embeddings) > 0, ( assert len(multimodal_embeddings) > 0, (
"For streaming you must provide a multimodal_embedding at every step." "For realtime you must provide a multimodal_embedding at every step."
) )
mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
return mm_embeds_flat return mm_embeds_flat
...@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti ...@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
audio_inputs = self._parse_and_validate_audio_arrays(**kwargs) audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
assert audio_inputs is not None, ( assert audio_inputs is not None, (
"For streaming you must provide an audio input at every step." "For realtime you must provide an audio input at every step."
) )
def _truncate_left( def _truncate_left(
......
...@@ -204,7 +204,7 @@ def _remap_mistral_audio_args(config: dict) -> dict: ...@@ -204,7 +204,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}") raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
architecture = ( architecture = (
"VoxtralStreamingGeneration" "VoxtralRealtimeGeneration"
if encoder_args.get("causal") if encoder_args.get("causal")
else "VoxtralForConditionalGeneration" else "VoxtralForConditionalGeneration"
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment