[Frontend] feat: add streaming support for token generation endpoint (#37171)

Signed-off-by: Hyeonki Hong <hyeonki.hong@moreh.io>

[Frontend] feat: add streaming support for token generation endpoint (#37171)
Signed-off-by: Hyeonki Hong <hyeonki.hong@moreh.io>
25f2b553 · Hyeonki Hong · GitHub · cb4ff07f · 25f2b553 · 25f2b553
Unverified commit 25f2b553, authored Apr 03, 2026 by Hyeonki Hong; committed by GitHub on Apr 03, 2026.
4 changed files
--- a/tests/entrypoints/serve/disagg/test_generate_stream.py
+++ b/tests/entrypoints/serve/disagg/test_generate_stream.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from vllm.config.multimodal import MultiModalConfig
+from vllm.entrypoints.openai.engine.protocol import StreamOptions
+from vllm.entrypoints.openai.models.protocol import BaseModelPath
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.disagg.protocol import GenerateRequest
+from vllm.entrypoints.serve.disagg.serving import ServingTokens
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+from vllm.logprobs import Logprob
+from vllm.outputs import CompletionOutput, RequestOutput
+from vllm.renderers import renderer_from_config
+from vllm.sampling_params import SamplingParams
+from vllm.v1.engine.async_llm import AsyncLLM
+
+MODEL_NAME = "openai-community/gpt2"
+BASE_MODEL_PATHS = [
+    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
+]
+
+
+@dataclass
+class MockHFConfig:
+    """Minimal stand-in for a Hugging Face config exposing only `model_type`."""
+
+    model_type: str = "any"
+
+
+@dataclass
+class MockModelConfig:
+    """Lightweight substitute for the model config used by the tests in this file.
+
+    Attributes declared without a type annotation are plain class attributes
+    shared by every instance; only the annotated ones are dataclass fields
+    and are therefore accepted by the generated `__init__`.
+    """
+
+    task = "generate"
+    runner_type = "generate"
+    model = MODEL_NAME
+    tokenizer = MODEL_NAME
+    trust_remote_code = False
+    tokenizer_mode = "auto"
+    max_model_len = 100
+    tokenizer_revision = None
+    # The three objects below are created once, when the class body runs,
+    # and are shared by all instances (they are not dataclass fields).
+    multimodal_config = MultiModalConfig()
+    hf_config = MockHFConfig()
+    hf_text_config = MockHFConfig()
+    logits_processors: list[str] | None = None
+    diff_sampling_param: dict | None = None
+    allowed_local_media_path: str = ""
+    allowed_media_domains: list[str] | None = None
+    encoder_config = None
+    generation_config: str = "auto"
+    # Mutable default, so it needs a factory to get a fresh dict per instance.
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    skip_tokenizer_init = False
+    is_encoder_decoder: bool = False
+    is_multimodal_model: bool = False
+    renderer_num_workers: int = 1
+
+    def get_diff_sampling_param(self):
+        """Return `diff_sampling_param`, or an empty dict when it is unset/empty."""
+        return self.diff_sampling_param or {}
+
+
+@dataclass
+class MockParallelConfig:
+    """Stand-in parallel config; carries only `_api_process_rank` (default 0)."""
+
+    _api_process_rank: int = 0
+
+
+@dataclass
+class MockVllmConfig:
+    """Bundle of the mock model and parallel configs handed to the renderer."""
+
+    model_config: MockModelConfig
+    parallel_config: MockParallelConfig
+
+
+def _build_renderer(model_config: MockModelConfig):
+    return renderer_from_config(
+        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
+    )
+
+
+def _build_serving_tokens(engine: AsyncLLM, **kwargs) -> ServingTokens:
+    """Assemble a `ServingTokens` around `engine` with preprocessing stubbed out.
+
+    Extra keyword arguments are forwarded to the `ServingTokens` constructor.
+    """
+    models = OpenAIServingModels(
+        engine_client=engine,
+        base_model_paths=BASE_MODEL_PATHS,
+    )
+    # The render layer reuses the engine's own config, renderer and IO processor.
+    serving_render = OpenAIServingRender(
+        model_config=engine.model_config,
+        renderer=engine.renderer,
+        io_processor=engine.io_processor,
+        model_registry=models.registry,
+        request_logger=None,
+        chat_template=None,
+        chat_template_content_format="auto",
+    )
+    serving = ServingTokens(
+        engine,
+        models,
+        openai_serving_render=serving_render,
+        request_logger=None,
+        **kwargs,
+    )
+
+    # NOTE: this stub's `kwargs` is its own parameter and shadows the outer one.
+    async def _fake_preprocess(*args, **kwargs):
+        return [{"prompt_token_ids": [1, 2, 3]}]
+
+    # Replace completion preprocessing so every call returns the fixed prompt
+    # above, regardless of the arguments it receives.
+    serving.openai_serving_render.preprocess_completion = AsyncMock(
+        side_effect=_fake_preprocess
+    )
+    return serving
+
+
+def _make_request_output(
+    request_id: str,
+    token_ids: list[int],
+    finish_reason: str | None = None,
+    finished: bool = False,
+    prompt_token_ids: list[int] | None = None,
+    logprobs: list[dict[int, Any] | None] | None = None,
+    num_cached_tokens: int | None = None,
+    index: int = 0,
+) -> RequestOutput:
+    """Build a `RequestOutput` that wraps exactly one `CompletionOutput`.
+
+    `token_ids`, `logprobs`, `finish_reason` and `index` describe the single
+    completion; the remaining arguments are set on the request output itself.
+    The completion text is always empty and no prompt text is attached.
+    """
+    return RequestOutput(
+        request_id=request_id,
+        prompt=None,
+        # Falls back to the default prompt when the argument is None *or* empty.
+        prompt_token_ids=prompt_token_ids or [1, 2, 3],
+        prompt_logprobs=None,
+        outputs=[
+            CompletionOutput(
+                index=index,
+                text="",
+                token_ids=token_ids,
+                cumulative_logprob=None,
+                logprobs=logprobs,
+                finish_reason=finish_reason,
+            )
+        ],
+        finished=finished,
+        metrics=None,
+        lora_request=None,
+        encoder_prompt=None,
+        encoder_prompt_token_ids=None,
+        num_cached_tokens=num_cached_tokens,
+    )
+
+
+def _mock_engine() -> MagicMock:
+    """Return a `MagicMock` specced on `AsyncLLM` and wired with mock configs.
+
+    The mock reports no engine error, uses `MockModelConfig` as its model
+    config and carries a renderer built from that config. Tests are expected
+    to assign `engine.generate` themselves.
+    """
+    engine = MagicMock(spec=AsyncLLM)
+    engine.errored = False
+    engine.model_config = MockModelConfig()
+    engine.input_processor = MagicMock()
+    engine.io_processor = MagicMock()
+    # Must come after `model_config` is set, since the renderer is built from it.
+    engine.renderer = _build_renderer(engine.model_config)
+    return engine
+
+
+def _parse_sse_chunks(chunks: list[str]) -> list[Any]:
+    """Parse SSE chunks into dicts (JSON) or raw strings ([DONE])."""
+    parsed: list[Any] = []
+    for chunk in chunks:
+        assert chunk.startswith("data: ") and chunk.endswith("\n\n")
+        payload = chunk[len("data: ") : -len("\n\n")]
+        if payload == "[DONE]":
+            parsed.append("[DONE]")
+        else:
+            parsed.append(json.loads(payload))
+    return parsed
+
+
+@pytest.mark.asyncio
+async def test_stream_basic():
+    """Streaming returns SSE chunks with correct token_ids and ends with [DONE]."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output("req-1", token_ids=[20, 30])
+        yield _make_request_output(
+            "req-1", token_ids=[40], finish_reason="stop", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+
+    # 3 data chunks + [DONE]
+    assert parsed[-1] == "[DONE]"
+    data_chunks = [c for c in parsed if c != "[DONE]"]
+    assert len(data_chunks) == 3
+
+    assert data_chunks[0]["choices"][0]["token_ids"] == [10]
+    assert data_chunks[1]["choices"][0]["token_ids"] == [20, 30]
+    assert data_chunks[2]["choices"][0]["token_ids"] == [40]
+    assert data_chunks[2]["choices"][0]["finish_reason"] == "stop"
+
+
+@pytest.mark.asyncio
+async def test_stream_error_mid_generation():
+    """finish_reason='error' mid-stream yields error chunk then [DONE]."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output(
+            "req-1", token_ids=[20], finish_reason="error", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    assert len(chunks) >= 2
+    assert any("Internal server error" in chunk for chunk in chunks), (
+        f"Expected error message in chunks: {chunks}"
+    )
+    assert chunks[-1] == "data: [DONE]\n\n"
+
+
+@pytest.mark.asyncio
+async def test_stream_error_with_empty_delta():
+    """finish_reason='error' with empty delta_token_ids still raises."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output(
+            "req-1", token_ids=[], finish_reason="error", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    assert any("Internal server error" in chunk for chunk in chunks), (
+        f"Expected error message in chunks: {chunks}"
+    )
+    assert chunks[-1] == "data: [DONE]\n\n"
+
+
+@pytest.mark.asyncio
+async def test_stream_skips_empty_token_output():
+    """Outputs with empty token_ids are skipped (no chunk emitted)."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output("req-1", token_ids=[])
+        yield _make_request_output(
+            "req-1", token_ids=[20], finish_reason="stop", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+    assert parsed[-1] == "[DONE]"
+    data_chunks = [c for c in parsed if c != "[DONE]"]
+
+    # Only 2 data chunks — the empty one is skipped
+    assert len(data_chunks) == 2
+    assert data_chunks[0]["choices"][0]["token_ids"] == [10]
+    assert data_chunks[1]["choices"][0]["token_ids"] == [20]
+
+
+@pytest.mark.asyncio
+async def test_stream_include_usage():
+    """stream_options.include_usage emits a final usage-only chunk."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output(
+            "req-1", token_ids=[20], finish_reason="stop", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+        stream_options=StreamOptions(include_usage=True),
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+    assert parsed[-1] == "[DONE]"
+
+    # The chunk before [DONE] should be the usage-only chunk
+    usage_chunk = parsed[-2]
+    assert usage_chunk["choices"] == []
+    assert usage_chunk["usage"]["prompt_tokens"] == 3
+    assert usage_chunk["usage"]["completion_tokens"] == 2
+    assert usage_chunk["usage"]["total_tokens"] == 5
+
+
+@pytest.mark.asyncio
+async def test_stream_continuous_usage():
+    """continuous_usage_stats adds usage to every data chunk."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output("req-1", token_ids=[10])
+        yield _make_request_output(
+            "req-1", token_ids=[20], finish_reason="stop", finished=True
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+        stream_options=StreamOptions(
+            include_usage=True,
+            continuous_usage_stats=True,
+        ),
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+    data_chunks = [c for c in parsed if isinstance(c, dict) and c.get("choices")]
+
+    # Every data chunk should have usage
+    for i, dc in enumerate(data_chunks):
+        assert dc["usage"] is not None, f"chunk {i} missing usage"
+        assert dc["usage"]["prompt_tokens"] == 3
+
+    # First chunk: 1 completion token
+    assert data_chunks[0]["usage"]["completion_tokens"] == 1
+    assert data_chunks[0]["usage"]["total_tokens"] == 4
+
+    # Second chunk: 2 completion tokens (cumulative)
+    assert data_chunks[1]["usage"]["completion_tokens"] == 2
+    assert data_chunks[1]["usage"]["total_tokens"] == 5
+
+
+@pytest.mark.asyncio
+async def test_stream_with_logprobs():
+    """Streaming with logprobs includes logprob data in each chunk."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output(
+            "req-1",
+            token_ids=[10],
+            logprobs=[{10: Logprob(logprob=-0.5)}],
+        )
+        yield _make_request_output(
+            "req-1",
+            token_ids=[20],
+            logprobs=[{20: Logprob(logprob=-1.0)}],
+            finish_reason="stop",
+            finished=True,
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10, logprobs=1),
+        model=MODEL_NAME,
+        stream=True,
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+    data_chunks = [c for c in parsed if isinstance(c, dict) and c.get("choices")]
+
+    for dc in data_chunks:
+        lp = dc["choices"][0]["logprobs"]
+        assert lp is not None
+        assert len(lp["content"]) == 1
+        assert lp["content"][0]["token"].startswith("token_id:")
+
+
+@pytest.mark.asyncio
+async def test_stream_prompt_tokens_details():
+    """enable_prompt_tokens_details includes cached_tokens in final usage."""
+    engine = _mock_engine()
+
+    async def mock_generate(*args, **kwargs):
+        yield _make_request_output(
+            "req-1",
+            token_ids=[10],
+            finish_reason="stop",
+            finished=True,
+            num_cached_tokens=2,
+        )
+
+    engine.generate = MagicMock(side_effect=mock_generate)
+    serving = _build_serving_tokens(engine, enable_prompt_tokens_details=True)
+
+    request = GenerateRequest(
+        token_ids=[1, 2, 3],
+        sampling_params=SamplingParams(max_tokens=10),
+        model=MODEL_NAME,
+        stream=True,
+        stream_options=StreamOptions(include_usage=True),
+    )
+
+    response = await serving.serve_tokens(request)
+    chunks = []
+    async for chunk in response:
+        chunks.append(chunk)
+
+    parsed = _parse_sse_chunks(chunks)
+    # Usage-only chunk (before [DONE])
+    usage_chunk = parsed[-2]
+    assert usage_chunk["choices"] == []
+    assert usage_chunk["usage"]["prompt_tokens_details"]["cached_tokens"] == 2
--- a/tests/entrypoints/serve/disagg/test_serving_tokens.py
+++ b/tests/entrypoints/serve/disagg/test_serving_tokens.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import json
 import os

 import httpx
@@ -113,6 +114,54 @@ async def test_generate_endpoint(client):
    assert "choices" in data


+@pytest.mark.asyncio
+async def test_generate_stream(client):
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": [1, 2, 3],
+        "sampling_params": {"max_tokens": 5},
+        "stream": True,
+    }
+    async with client.stream("POST", GEN_ENDPOINT, json=payload) as resp:
+        resp.raise_for_status()
+        chunks = []
+        async for line in resp.aiter_lines():
+            if not line.startswith("data: "):
+                continue
+            payload_str = line[len("data: ") :]
+            if payload_str == "[DONE]":
+                break
+            chunks.append(json.loads(payload_str))
+
+    assert len(chunks) > 0
+    # Every chunk has choices with token_ids
+    all_token_ids = []
+    for chunk in chunks:
+        assert "choices" in chunk
+        assert len(chunk["choices"]) == 1
+        choice = chunk["choices"][0]
+        assert "token_ids" in choice
+        assert len(choice["token_ids"]) > 0
+        all_token_ids.extend(choice["token_ids"])
+
+    # Last chunk should have a finish_reason
+    assert chunks[-1]["choices"][0]["finish_reason"] is not None
+
+    # Streaming should produce the same tokens as non-streaming
+    non_stream_resp = await client.post(
+        GEN_ENDPOINT,
+        json={
+            "model": MODEL_NAME,
+            "token_ids": [1, 2, 3],
+            "sampling_params": {"max_tokens": 5, "temperature": 0.0},
+            "stream": False,
+        },
+    )
+    non_stream_data = non_stream_resp.json()
+    # Just verify we got the right number of tokens
+    assert len(all_token_ids) == len(non_stream_data["choices"][0]["token_ids"])
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("logprobs_value", [0, 1, 5])
 async def test_generate_logprobs(client, logprobs_value):

--- a/vllm/entrypoints/serve/disagg/protocol.py
+++ b/vllm/entrypoints/serve/disagg/protocol.py
@@ -6,7 +6,7 @@ from pydantic import BaseModel, Field, field_validator

 from vllm.config import ModelConfig
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs
-from vllm.entrypoints.openai.engine.protocol import StreamOptions
+from vllm.entrypoints.openai.engine.protocol import StreamOptions, UsageInfo
 from vllm.logprobs import Logprob
 from vllm.renderers import TokenizeParams
 from vllm.sampling_params import SamplingParams
@@ -122,6 +122,26 @@ class GenerateResponseChoice(BaseModel):
    token_ids: list[int] | None = None


+# One choice inside a streamed chunk. The streaming generator fills
+# `token_ids` with the delta token ids of a single engine output, plus that
+# output's index, optional logprobs and finish reason.
+# (Kept as `#` comments: a class docstring would show up in the model schema.)
+class GenerateResponseStreamChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    token_ids: list[int] | None = None
+
+
+# Payload of a single streamed chunk for the token generation endpoint.
+# (Kept as `#` comments: a class docstring would show up in the model schema.)
+class GenerateStreamResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    # Empty for the trailing usage-only chunk; one entry for data chunks.
+    choices: list[GenerateResponseStreamChoice]
+    # Left unset unless usage reporting was requested via stream options.
+    usage: UsageInfo | None = Field(default=None)
+
+
 class GenerateResponse(BaseModel):
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",

--- a/vllm/entrypoints/serve/disagg/serving.py
+++ b/vllm/entrypoints/serve/disagg/serving.py
@@ -18,6 +18,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
 )
 from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
+    GenerationError,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
@@ -28,12 +29,15 @@ from vllm.entrypoints.serve.disagg.protocol import (
    GenerateRequest,
    GenerateResponse,
    GenerateResponseChoice,
+    GenerateResponseStreamChoice,
+    GenerateStreamResponse,
 )
 from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+from vllm.entrypoints.utils import should_include_usage
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.utils.collection_utils import as_list

 logger = init_logger(__name__)
@@ -74,7 +78,7 @@ class ServingTokens(OpenAIServing):
        self,
        request: GenerateRequest,
        raw_request: Request | None = None,
-    ) -> GenerateResponse | ErrorResponse:
+    ) -> GenerateResponse | ErrorResponse | AsyncGenerator[str, None]:
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            logger.error("Error with model %s", error_check_ret)
@@ -110,6 +114,8 @@ class ServingTokens(OpenAIServing):
        sampling_params = request.sampling_params
        if self.force_no_detokenize:
            sampling_params.detokenize = False
+        if request.stream:
+            sampling_params.output_kind = RequestOutputKind.DELTA

        self._log_inputs(
            request_id,
@@ -133,9 +139,17 @@ class ServingTokens(OpenAIServing):
            priority=request.priority,
        )

-        # TODO(NickLucche): Implement streaming response
-
        assert result_generator is not None
+
+        if request.stream:
+            return self.serve_tokens_stream_generator(
+                request,
+                result_generator,
+                request_id,
+                model_name,
+                request_metadata,
+            )
+
        return await self.serve_tokens_full_generator(
            request, result_generator, request_id, model_name, request_metadata
        )
@@ -236,6 +250,109 @@ class ServingTokens(OpenAIServing):

        return response

+    async def serve_tokens_stream_generator(
+        self,
+        request: GenerateRequest,
+        result_generator: AsyncGenerator[RequestOutput, None],
+        request_id: str,
+        model_name: str,
+        request_metadata: RequestResponseMetadata,
+    ) -> AsyncGenerator[str, None]:
+        num_prompt_tokens = 0
+        num_generated_tokens: list[int] = []
+        first_iteration = True
+        num_cached_tokens = None
+        sampling_params: SamplingParams = request.sampling_params
+
+        include_usage, include_continuous_usage = should_include_usage(
+            request.stream_options, False
+        )
+
+        try:
+            async for res in result_generator:
+                if first_iteration:
+                    if res.prompt_token_ids is not None:
+                        num_prompt_tokens = len(res.prompt_token_ids)
+                    if res.encoder_prompt_token_ids is not None:
+                        num_prompt_tokens += len(res.encoder_prompt_token_ids)
+                    num_cached_tokens = res.num_cached_tokens
+                    num_generated_tokens = [0] * len(res.outputs)
+                    first_iteration = False
+
+                for output in res.outputs:
+                    i = output.index
+                    delta_token_ids = output.token_ids
+                    num_generated_tokens[i] += len(delta_token_ids)
+
+                    finish_reason = output.finish_reason
+                    self._raise_if_error(finish_reason, request_id)
+
+                    if not delta_token_ids:
+                        continue
+
+                    if sampling_params.logprobs is not None:
+                        out_logprobs = output.logprobs
+                        assert out_logprobs is not None, "Did not output logprobs"
+                        logprobs = self._create_tokens_logprobs(
+                            token_ids=delta_token_ids,
+                            top_logprobs=out_logprobs,
+                            num_output_top_logprobs=sampling_params.logprobs,
+                        )
+                    else:
+                        logprobs = None
+
+                    chunk = GenerateStreamResponse(
+                        request_id=request_id,
+                        choices=[
+                            GenerateResponseStreamChoice(
+                                index=i,
+                                logprobs=logprobs,
+                                finish_reason=finish_reason,
+                                token_ids=as_list(delta_token_ids),
+                            )
+                        ],
+                    )
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=num_generated_tokens[i],
+                            total_tokens=(num_prompt_tokens + num_generated_tokens[i]),
+                        )
+
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+            total_completion_tokens = sum(num_generated_tokens)
+            final_usage_info = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                completion_tokens=total_completion_tokens,
+                total_tokens=num_prompt_tokens + total_completion_tokens,
+            )
+
+            if self.enable_prompt_tokens_details and num_cached_tokens:
+                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
+                    cached_tokens=num_cached_tokens
+                )
+
+            if include_usage:
+                final_chunk = GenerateStreamResponse(
+                    request_id=request_id,
+                    choices=[],
+                    usage=final_usage_info,
+                )
+                yield f"data: {final_chunk.model_dump_json(exclude_none=True)}\n\n"
+
+            request_metadata.final_usage_info = final_usage_info
+
+        except GenerationError as e:
+            yield (
+                f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
+            )
+        except Exception as e:
+            logger.exception("Error in token generation stream.")
+            data = self.create_streaming_error_response(e)
+            yield f"data: {data}\n\n"
+        yield "data: [DONE]\n\n"
+
    def _create_tokens_logprobs(
        self,
        token_ids: GenericSequence[int],