jerrrrry / infinilm · Commit 338b35f5
Authored Jan 20, 2026 by Ceng23333

Issue/193: feats for deployment

Signed-off-by: Ceng23333 <441651826@qq.com>
Parent: f2d9d397
Showing 4 changed files with 161 additions and 25 deletions (+161 -25)
python/infinilm/llm/llm.py                   +62  -4
python/infinilm/llm/request.py                +4  -0
python/infinilm/llm/scheduler.py              +8  -0
python/infinilm/server/inference_server.py   +87 -21
python/infinilm/llm/llm.py
...
...
@@ -228,11 +228,39 @@ class LLMEngine:
             req.generated_token_ids.append(token_id)
             if req.is_prefill:
                 req.is_prefill = False
-            token_text = self.tokenizer.decode(token_id)
-            req.generated_text += token_text
-            if self._check_request_finished(req, token_id):
+            # vLLM-style replacement character handling is primarily relevant for streaming.
+            # For offline generation (no output queue), keep the fast incremental path.
+            if req._output_queue is None:
+                token_text = self.detokenize([token_id])
+                req.generated_text += token_text
+            else:
+                # Streaming path: compute delta from a full decode so we can hold back
+                # trailing '\ufffd' (likely an incomplete UTF-8 sequence).
+                decoded_text = self.detokenize(req.generated_token_ids)
+                finished_now = False
+                if self._check_request_finished(req, token_id):
                     req.mark_finished(req.finish_reason)
+                    finished_now = True
+                # Update generated_text to the latest decode (used for stop-string checks and debugging)
+                req.generated_text = decoded_text
+                holds_back_incomplete_utf8 = (
+                    bool(decoded_text) and decoded_text.endswith("\ufffd")
+                )
+                # vLLM-style: hold back only if we are not on the final chunk.
+                if holds_back_incomplete_utf8 and not finished_now:
+                    token_text = ""
+                else:
+                    last_len = getattr(req, "_stream_last_yielded_length", 0)
+                    token_text = decoded_text[last_len:]
+                    if token_text:
+                        req._stream_last_yielded_length = len(decoded_text)
+
+            # For non-streaming, finish checks happen here.
+            if req._output_queue is None and self._check_request_finished(req, token_id):
+                req.mark_finished(req.finish_reason)

             # Put output in queue if it exists (for async streaming)
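Reviewer note: the hold-back rule above, in isolation. A minimal sketch, assuming a HuggingFace-style tokenizer whose decode emits U+FFFD ('\ufffd') while a multi-byte UTF-8 sequence is still incomplete; next_stream_chunk and its arguments are hypothetical names, not engine code.

    def next_stream_chunk(decode, token_ids, last_len, is_final):
        """Return (chunk, new_last_len) for one streaming step (sketch)."""
        decoded = decode(token_ids)  # full decode of all tokens so far
        if decoded.endswith("\ufffd") and not is_final:
            # Trailing replacement char: likely a partial UTF-8 sequence,
            # so hold the chunk back until the next token (or final flush).
            return "", last_len
        chunk = decoded[last_len:]   # delta since the last yielded length
        return chunk, len(decoded)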
...
...
@@ -283,12 +311,15 @@ class LLMEngine:
         self,
         messages: List[dict],
         add_generation_prompt: bool = True,
+        chat_template_kwargs: Optional[dict] = None,
     ) -> str:
         """Apply chat template to messages."""
+        chat_template_kwargs = chat_template_kwargs or {}
         return self.tokenizer.apply_chat_template(
             conversation=messages,
             add_generation_prompt=add_generation_prompt,
             tokenize=False,
             **chat_template_kwargs,
         )
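Reviewer note: a hypothetical caller showing what the new parameter enables. enable_thinking is only an illustrative key; whether it has any effect depends on the model's own chat template.

    prompt = engine.apply_chat_template(
        [{"role": "user", "content": "Hello"}],
        add_generation_prompt=True,
        chat_template_kwargs={"enable_thinking": False},  # forwarded as **kwargs
    )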
...
...
@@ -486,6 +517,10 @@ class AsyncLLMEngine:
         self._running = False
         self._step_thread: Optional[threading.Thread] = None
+        self._healthy = True
+
+    def is_healthy(self) -> bool:
+        return bool(self._healthy)

     def start(self):
         """Start the background inference loop."""
...
...
@@ -520,6 +555,7 @@ class AsyncLLMEngine:
                     time.sleep(0.01)
             except Exception as e:
                 logger.error(f"Error in step loop: {e}", exc_info=True)
+                self._healthy = False
                 self._running = False
                 break
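Reviewer note: the sticky health flag pattern, reduced to a sketch (the class and _step_once are hypothetical). Any unhandled exception in the loop flips _healthy permanently, so the /health route can report 503 instead of the process looking alive while doing nothing.

    class StickyHealth:
        def __init__(self):
            self._healthy = True
            self._running = True

        def is_healthy(self) -> bool:
            return bool(self._healthy)

        def _step_once(self):
            pass  # stand-in for one engine step

        def _loop(self):
            while self._running:
                try:
                    self._step_once()
                except Exception:
                    self._healthy = False  # sticky: never reset to True
                    self._running = False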
...
...
@@ -581,6 +617,8 @@ class AsyncLLMEngine:
         request_id: Optional[str] = None,
         request_data: Optional[dict] = None,
         http_request: Optional[any] = None,
+        add_generation_prompt: bool = True,
+        chat_template_kwargs: Optional[dict] = None,
     ) -> InferenceRequest:
         """Add a chat request to the engine.
...
...
@@ -594,7 +632,11 @@ class AsyncLLMEngine:
         Returns:
             The created InferenceRequest object.
         """
-        prompt = self.engine.apply_chat_template(messages, add_generation_prompt=True)
+        prompt = self.engine.apply_chat_template(
+            messages,
+            add_generation_prompt=add_generation_prompt,
+            chat_template_kwargs=chat_template_kwargs,
+        )
         return self.add_request(
             prompt=prompt,
             sampling_params=sampling_params,
...
...
@@ -607,6 +649,7 @@ class AsyncLLMEngine:
         self,
         request: InferenceRequest,
         timeout: float = 100.0,
+        request_timeout: Optional[float] = None,
     ) -> AsyncIterator[TokenOutput]:
         """Stream tokens from a request.
...
...
@@ -619,6 +662,7 @@ class AsyncLLMEngine:
         """
         import asyncio

+        start = time.time()
         while True:
             if request.is_finished() and request.output_queue.async_q.empty():
                 break
...
...
@@ -635,6 +679,20 @@ class AsyncLLMEngine:
                 if token_output.finished:
                     break
             except asyncio.TimeoutError:
+                # Enforce request-level timeout even if no tokens are produced.
+                if request_timeout is not None:
+                    now = time.time()
+                    if now - start > float(request_timeout):
+                        request.mark_timeout()
+                        yield TokenOutput(
+                            request_id=request.request_id,
+                            token_id=-1,
+                            token_text="",
+                            finished=True,
+                            finish_reason=FinishReason.TIMEOUT,
+                            generated_text=request.generated_text,
+                        )
+                        break
                 if request.is_finished():
                     break
                 continue
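Reviewer note: consumer-side, the two timeouts compose like this (sketch; engine, req, and FinishReason assumed from the surrounding module). timeout bounds the wait for each token; request_timeout bounds the whole request even when no tokens arrive at all.

    async for out in engine.stream_request(req, timeout=100.0, request_timeout=1000.0):
        if out.finish_reason == FinishReason.TIMEOUT:
            break  # synthetic terminal TokenOutput; surface it, don't raise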
...
...
python/infinilm/llm/request.py
...
...
@@ -144,6 +144,10 @@ class InferenceRequest:
         # Output management (for async streaming)
         self._output_queue: Optional[janus.Queue] = None

+        # Streaming helpers (vLLM-style UTF-8 buffering at the chunking layer)
+        # Used by the engine to compute "delta" text chunks from a full decode.
+        self._stream_last_yielded_length: int = 0
+
     @property
     def output_queue(self) -> janus.Queue:
         """Lazy initialization of output queue."""
...
...
python/infinilm/llm/scheduler.py
...
...
@@ -154,6 +154,10 @@ class Scheduler:
                 req = self.waiting_queue.sync_q.get_nowait()
             except queue.Empty:
                 break
+            # Skip requests that were already finished (e.g., timed out/canceled while waiting)
+            if req.is_finished():
+                self.complete_requests([req])
+                continue

             req_tokens = req.get_input_tokens()
             num_required_blocks = req.get_num_blocks_required(self.block_size)
...
...
@@ -185,6 +189,10 @@ class Scheduler:
                 req = self.running_queue.sync_q.get_nowait()
             except queue.Empty:
                 break
+            # Skip requests that were already finished (e.g., timed out/canceled while running)
+            if req.is_finished():
+                self.complete_requests([req])
+                continue

             # Decode phase: allocate slot for newly generated token
             try:
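Reviewer note: both queues now share the same drain-and-skip shape. Generically (sketch; names are placeholders), retiring finished requests at dequeue time keeps them from re-entering scheduling and releases their resources promptly:

    import queue

    def drain_live(sync_q, complete_requests):
        live = []
        while True:
            try:
                req = sync_q.get_nowait()
            except queue.Empty:
                break
            if req.is_finished():         # timed out / canceled while queued
                complete_requests([req])  # retire and free resources
                continue
            live.append(req)
        return live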
...
...
python/infinilm/server/inference_server.py
...
...
@@ -10,6 +10,7 @@ import uuid
 import argparse
 import uvicorn
 import logging
+import os

 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
...
...
@@ -22,7 +23,7 @@ DEFAULT_STREAM_TIMEOUT = 100.0
 DEFAULT_REQUEST_TIMEOUT = 1000.0

-def chunk_json(id_, content=None, role=None, finish_reason=None):
+def chunk_json(id_, content=None, role=None, finish_reason=None, model: str = "unknown"):
     """Generate JSON chunk for streaming response."""
     delta = {}
     if content:
...
...
@@ -33,7 +34,7 @@ def chunk_json(id_, content=None, role=None, finish_reason=None):
         "id": id_,
         "object": "chat.completion.chunk",
         "created": int(time.time()),
-        "model": "jiuge",
+        "model": model,
         "system_fingerprint": None,
         "choices": [
             {
...
...
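Reviewer note: a chunk produced by the updated helper, roughly (values illustrative; the choices entry is collapsed in this diff, so its shape is assumed from the OpenAI chat.completion.chunk format):

    {
        "id": "chatcmpl-123",
        "object": "chat.completion.chunk",
        "created": 1737350000,
        "model": "qwen2.5-7b",  # was hardcoded "jiuge" before this commit
        "system_fingerprint": None,
        "choices": [
            {"index": 0, "delta": {"content": "Hi"}, "finish_reason": None},
        ],
    }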
@@ -84,6 +85,8 @@ class InferenceServer:
             port: Server port number.
         """
         self.model_path = model_path
+        # vLLM-like served model id: directory name of model_path
+        self.model_id = os.path.basename(os.path.normpath(model_path)) or "model"
         self.device = device
         self.dtype = dtype
         self.tensor_parallel_size = tensor_parallel_size
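Reviewer note: normpath-then-basename is what makes trailing slashes harmless, and the `or "model"` fallback covers the degenerate case (illustrative POSIX paths):

    import os

    def served_model_id(model_path: str) -> str:
        return os.path.basename(os.path.normpath(model_path)) or "model"

    assert served_model_id("/data/models/qwen2.5-7b/") == "qwen2.5-7b"
    assert served_model_id("/data/models/qwen2.5-7b") == "qwen2.5-7b"
    assert served_model_id("/") == "model"  # basename is "" -> fallback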
...
...
@@ -136,7 +139,10 @@ class InferenceServer:
     def _register_routes(self, app: FastAPI):
         """Register API routes."""

+        # OpenAI-compatible chat completions endpoint.
+        # Support both legacy path and OpenAI-style /v1 prefix for proxy/router compatibility.
         @app.post("/chat/completions")
+        @app.post("/v1/chat/completions")
         async def chat_completions(request: Request):
             try:
                 data = await request.json()
...
...
@@ -169,15 +175,21 @@ class InferenceServer:
         @app.get("/health")
         async def health():
+            # Expose engine health so babysitter/registry can treat backend as unhealthy.
+            if (
+                self.engine is not None
+                and hasattr(self.engine, "is_healthy")
+                and not self.engine.is_healthy()
+            ):
+                return JSONResponse(content={"status": "unhealthy"}, status_code=503)
             return {"status": "healthy"}

-        @app.get("/v1/models")
-        async def list_models():
+        def _models_payload():
             return {
                 "object": "list",
                 "data": [
                     {
-                        "id": "jiuge",
+                        "id": self.model_id,
                         "object": "model",
                         "created": int(time.time()),
                         "owned_by": "infinilm",
...
...
@@ -185,14 +197,53 @@ class InferenceServer:
                 ],
             }

+        # Support both /v1/models (OpenAI) and /models (common legacy) for compatibility.
+        @app.get("/v1/models")
+        async def list_models():
+            return _models_payload()
+
+        @app.get("/models")
+        async def list_models_legacy():
+            return _models_payload()
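Reviewer note: a quick probe of both behaviors (hypothetical host/port; uses the requests package):

    import requests

    base = "http://127.0.0.1:8000"  # hypothetical server address

    # Returns 503 with {"status": "unhealthy"} once the step loop has died.
    print(requests.get(f"{base}/health").status_code)

    # The OpenAI path and the legacy alias serve the same payload.
    assert (requests.get(f"{base}/v1/models").json()
            == requests.get(f"{base}/models").json())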
     def _build_sampling_params(self, data: dict) -> SamplingParams:
         """Build SamplingParams from request data."""
+        # Support both:
+        # - top-level OpenAI-ish fields: temperature/top_p/top_k/max_tokens/stop
+        # - nested dict: sampling_params: { ... }
+        sp = data.get("sampling_params") or {}
+        if not isinstance(sp, dict):
+            sp = {}
+
+        def pick(key: str, default):
+            # Priority: explicit top-level field > nested sampling_params > server default
+            if key in data and data.get(key) is not None:
+                return data.get(key)
+            if key in sp and sp.get(key) is not None:
+                return sp.get(key)
+            return default
+
+        # Accept common alias
+        max_tokens = pick("max_tokens", self.max_tokens)
+        if max_tokens is None:
+            # Some clients use max_new_tokens
+            max_tokens = pick("max_new_tokens", self.max_tokens)
+
+        stop = pick("stop", None)
+        if isinstance(stop, str):
+            stop = [stop]
+        stop_token_ids = pick("stop_token_ids", None)
+        if isinstance(stop_token_ids, int):
+            stop_token_ids = [stop_token_ids]
+
         return SamplingParams(
-            temperature=data.get("temperature", self.temperature),
-            top_p=data.get("top_p", self.top_p),
-            top_k=data.get("top_k", self.top_k),
-            max_tokens=data.get("max_tokens", self.max_tokens),
-            stop=data.get("stop"),
+            temperature=float(pick("temperature", self.temperature)),
+            top_p=float(pick("top_p", self.top_p)),
+            top_k=int(pick("top_k", self.top_k)),
+            max_tokens=int(max_tokens) if max_tokens is not None else None,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
         )

     async def _stream_chat(self, request_id: str, data: dict, http_request: Request):
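Reviewer note: given the precedence rule (top-level field > nested sampling_params > server default), a request like the following resolves as annotated, assuming the server default for max_tokens is None so the alias applies:

    data = {
        "temperature": 0.2,                              # top-level beats nested
        "sampling_params": {"temperature": 0.9, "top_k": 5},
        "max_new_tokens": 128,                           # alias for max_tokens
        "stop": "###",                                   # str normalized to ["###"]
    }
    # -> SamplingParams(temperature=0.2, top_k=5, max_tokens=128, stop=["###"])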
...
...
@@ -210,22 +261,26 @@ class InferenceServer:
             request_id=request_id,
             request_data=data,
             http_request=http_request,
+            add_generation_prompt=bool(data.get("add_generation_prompt", True)),
+            chat_template_kwargs=data.get("chat_template_kwargs") or {},
         )

         async for token_output in self.engine.stream_request(
-            req, timeout=DEFAULT_STREAM_TIMEOUT
+            req,
+            timeout=DEFAULT_STREAM_TIMEOUT,
+            request_timeout=DEFAULT_REQUEST_TIMEOUT,
         ):
-            # Check timeout
-            if time.time() - start_time > DEFAULT_REQUEST_TIMEOUT:
+            # If stream_request enforces timeout, we can just surface the state to the client.
+            if token_output.finish_reason == FinishReason.TIMEOUT:
                 logger.warning(f"Request {request_id} timed out after {DEFAULT_REQUEST_TIMEOUT}s")
                 req.mark_timeout()
                 error_chunk = json.dumps(
                     chunk_json(
                         request_id,
                         content="[Request timeout]",
                         finish_reason="timeout",
+                        model=self.model_id,
                     ),
                     ensure_ascii=False,
                 )
...
...
@@ -240,7 +295,9 @@ class InferenceServer:
             # Send token
             chunk = json.dumps(
-                chunk_json(request_id, content=token_output.token_text),
+                chunk_json(
+                    request_id, content=token_output.token_text, model=self.model_id
+                ),
                 ensure_ascii=False,
             )
             yield f"data: {chunk}\n\n"
...
...
@@ -250,7 +307,9 @@ class InferenceServer:
                     token_output.finish_reason
                 )
                 chunk = json.dumps(
-                    chunk_json(request_id, finish_reason=finish_reason),
+                    chunk_json(
+                        request_id, finish_reason=finish_reason, model=self.model_id
+                    ),
                     ensure_ascii=False,
                 )
                 yield f"data: {chunk}\n\n"
...
...
@@ -262,7 +321,10 @@ class InferenceServer:
             req.mark_failed()
             error_chunk = json.dumps(
                 chunk_json(
-                    request_id, content=f"[Error: {str(e)}]", finish_reason="error"
+                    request_id,
+                    content=f"[Error: {str(e)}]",
+                    finish_reason="error",
+                    model=self.model_id,
                 ),
                 ensure_ascii=False,
             )
...
...
@@ -290,17 +352,20 @@ class InferenceServer:
             request_id=request_id,
             request_data=data,
             http_request=http_request,
+            add_generation_prompt=bool(data.get("add_generation_prompt", True)),
+            chat_template_kwargs=data.get("chat_template_kwargs") or {},
         )

         # Collect all generated tokens
         output_text = ""
         async for token_output in self.engine.stream_request(
-            req, timeout=DEFAULT_STREAM_TIMEOUT
+            req,
+            timeout=DEFAULT_STREAM_TIMEOUT,
+            request_timeout=DEFAULT_REQUEST_TIMEOUT,
         ):
-            # Check timeout
-            if time.time() - start_time > DEFAULT_REQUEST_TIMEOUT:
+            # Request-level timeout is handled inside stream_request.
+            if token_output.finish_reason == FinishReason.TIMEOUT:
                 logger.warning(f"Request {request_id} timed out")
                 req.mark_timeout()
                 break

             # Check client disconnect
...
...
@@ -322,6 +387,7 @@ class InferenceServer:
             content=output_text,
             role="assistant",
             finish_reason=finish_reason or "stop",
+            model=self.model_id,
         )

         return response
...
...