"tests/ray/vscode:/vscode.git/clone" did not exist on "f87b35b227fb1c1309cb9e9fb72bf22939fbfbc9"
Commit 366dfe82 authored by jixx's avatar jixx
Browse files

init

parents
Pipeline #1939 canceled with stages
import json
import requests
import warnings
from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
from text_generation import DEPRECATION_WARNING
from text_generation.types import (
StreamResponse,
Response,
Request,
Parameters,
Grammar,
CompletionRequest,
Completion,
CompletionComplete,
ChatRequest,
ChatCompletionChunk,
ChatComplete,
Message,
Tool,
)
from text_generation.errors import parse_error
# emit deprecation warnings
warnings.simplefilter("always", DeprecationWarning)
class Client:
"""Client to make calls to a text-generation-inference instance
Example:
```python
>>> from text_generation import Client
>>> client = Client("https://api-inference.huggingface.co/models/bigscience/bloomz")
>>> client.generate("Why is the sky blue?").generated_text
' Rayleigh scattering'
>>> result = ""
>>> for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(
self,
base_url: str,
headers: Optional[Dict[str, str]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: int = 10,
):
"""
Args:
base_url (`str`):
text-generation-inference instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
Cookies to include in the requests
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = timeout
def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
):
"""
Given a prompt, generate a response synchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return Completion(**payload)
else:
return self._completion_stream_response(request)
def _completion_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the server-sent event stream
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
):
"""
Given a list of messages, generate a response synchronously
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_prompt (`str`):
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return ChatComplete(**payload)
else:
return self._chat_stream_response(request)
def _chat_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the server-sent event stream
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
def generate(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
best_of (`int`):
Generate best_of sequences and return the one with the highest token logprobs
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
"""
# Validate parameters
parameters = Parameters(
best_of=best_of,
details=True,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
decoder_input_details=decoder_input_details,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
resp = requests.post(
self.base_url,
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return Response(**payload[0])
def generate_stream(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Iterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Iterator[StreamResponse]: stream of generated tokens
"""
# Validate parameters
parameters = Parameters(
best_of=None,
details=True,
decoder_input_details=False,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
resp = requests.post(
self.base_url,
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
if resp.status_code != 200:
raise parse_error(resp.status_code, resp.json())
# Parse ServerSentEvents
for byte_payload in resp.iter_lines():
# Skip line
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
# Event data
if payload.startswith("data:"):
# Decode payload
json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
# Parse payload
try:
response = StreamResponse(**json_payload)
except ValidationError:
# If we failed to parse the payload, then it is an error payload
raise parse_error(resp.status_code, json_payload)
yield response
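Before the asynchronous client, here is a minimal, self-contained usage sketch for the synchronous `Client` defined above. The base URL is a placeholder for any reachable text-generation-inference instance, and the parameter values are purely illustrative.

```python
from text_generation import Client
from text_generation.types import Message

# Placeholder endpoint: point this at any running text-generation-inference server.
client = Client("http://localhost:8080", timeout=30)

# Single-shot generation: returns a Response with the text and token-level details.
response = client.generate(
    "Why is the sky blue?", max_new_tokens=32, do_sample=True, temperature=0.7
)
print(response.generated_text)
print(response.details.finish_reason, response.details.generated_tokens)

# Token streaming: generate_stream yields StreamResponse objects as tokens arrive.
text = ""
for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=32):
    if not chunk.token.special:
        text += chunk.token.text
print(text)

# OpenAI-style chat streaming: chat(..., stream=True) returns an iterator of
# ChatCompletionChunk objects whose deltas carry pieces of the reply.
for chunk in client.chat(
    messages=[Message(role="user", content="Tell me a joke.")],
    max_tokens=64,
    stream=True,
):
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```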
class AsyncClient:
"""Asynchronous Client to make calls to a text-generation-inference instance
Example:
```python
>>> from text_generation import AsyncClient
>>> client = AsyncClient("https://api-inference.huggingface.co/models/bigscience/bloomz")
>>> response = await client.generate("Why is the sky blue?")
>>> response.generated_text
' Rayleigh scattering'
>>> result = ""
>>> async for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(
self,
base_url: str,
headers: Optional[Dict[str, str]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: int = 10,
):
"""
Args:
base_url (`str`):
text-generation-inference instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
Cookies to include in the requests
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = ClientTimeout(timeout)
async def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
) -> Union[Completion, AsyncIterator[CompletionComplete]]:
"""
Given a prompt, generate a response asynchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
return await self._completion_single_response(request)
else:
return self._completion_stream_response(request)
async def _completion_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return Completion(**payload)
async def _completion_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
async def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
"""
Given a list of messages, generate a response asynchronously
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_prompt (`str`):
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
return await self._chat_single_response(request)
else:
return self._chat_stream_response(request)
async def _chat_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return ChatComplete(**payload)
async def _chat_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
async def generate(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text asynchronously
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
best_of (`int`):
Generate best_of sequences and return the one with the highest token logprobs
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
"""
# Validate parameters
parameters = Parameters(
best_of=best_of,
details=True,
decoder_input_details=decoder_input_details,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(self.base_url, json=request.dict()) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return Response(**payload[0])
async def generate_stream(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> AsyncIterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens asynchronously
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
AsyncIterator[StreamResponse]: stream of generated tokens
"""
# Validate parameters
parameters = Parameters(
best_of=None,
details=True,
decoder_input_details=False,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(self.base_url, json=request.dict()) as resp:
if resp.status != 200:
raise parse_error(resp.status, await resp.json())
# Parse ServerSentEvents
async for byte_payload in resp.content:
# Skip line
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
# Event data
if payload.startswith("data:"):
# Decode payload
json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
# Parse payload
try:
response = StreamResponse(**json_payload)
except ValidationError:
# If we failed to parse the payload, then it is an error payload
raise parse_error(resp.status, json_payload)
yield response
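The asynchronous client mirrors the synchronous one; a short sketch, again assuming a placeholder server URL:

```python
import asyncio

from text_generation import AsyncClient


async def main() -> None:
    # Placeholder endpoint for a running text-generation-inference server.
    client = AsyncClient("http://localhost:8080", timeout=60)

    # Awaitable single response.
    response = await client.generate("Why is the sky blue?", max_new_tokens=32)
    print(response.generated_text)

    # Asynchronous token stream.
    text = ""
    async for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=32):
        if not chunk.token.special:
            text += chunk.token.text
    print(text)


asyncio.run(main())
```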
from typing import Dict
# Text Generation Inference Errors
class ValidationError(Exception):
def __init__(self, message: str):
super().__init__(message)
class GenerationError(Exception):
def __init__(self, message: str):
super().__init__(message)
class OverloadedError(Exception):
def __init__(self, message: str):
super().__init__(message)
class IncompleteGenerationError(Exception):
def __init__(self, message: str):
super().__init__(message)
# API Inference Errors
class BadRequestError(Exception):
def __init__(self, message: str):
super().__init__(message)
class ShardNotReadyError(Exception):
def __init__(self, message: str):
super().__init__(message)
class ShardTimeoutError(Exception):
def __init__(self, message: str):
super().__init__(message)
class NotFoundError(Exception):
def __init__(self, message: str):
super().__init__(message)
class RateLimitExceededError(Exception):
def __init__(self, message: str):
super().__init__(message)
class NotSupportedError(Exception):
def __init__(self, model_id: str):
message = (
f"Model `{model_id}` is not available for inference with this client. \n"
"Use `huggingface_hub.inference_api.InferenceApi` instead."
)
super(NotSupportedError, self).__init__(message)
# Unknown error
class UnknownError(Exception):
def __init__(self, message: str):
super().__init__(message)
def parse_error(status_code: int, payload: Dict[str, str]) -> Exception:
"""
Parse error given an HTTP status code and a json payload
Args:
status_code (`int`):
HTTP status code
payload (`Dict[str, str]`):
Json payload
Returns:
Exception: parsed exception
"""
# Try to parse a Text Generation Inference error
message = payload["error"]
if "error_type" in payload:
error_type = payload["error_type"]
if error_type == "generation":
return GenerationError(message)
if error_type == "incomplete_generation":
return IncompleteGenerationError(message)
if error_type == "overloaded":
return OverloadedError(message)
if error_type == "validation":
return ValidationError(message)
# Try to parse a APIInference error
if status_code == 400:
return BadRequestError(message)
if status_code == 403 or status_code == 424:
return ShardNotReadyError(message)
if status_code == 504:
return ShardTimeoutError(message)
if status_code == 404:
return NotFoundError(message)
if status_code == 429:
return RateLimitExceededError(message)
# Fallback to an unknown error
return UnknownError(message)
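A small sketch of how `parse_error` dispatches: the payload's `error_type` takes precedence, then the HTTP status code, then the unknown-error fallback. The payloads below are illustrative.

```python
from text_generation.errors import (
    GenerationError,
    RateLimitExceededError,
    UnknownError,
    parse_error,
)

# `error_type` in the payload takes precedence over the HTTP status code.
err = parse_error(
    500, {"error": "Request failed during generation", "error_type": "generation"}
)
assert isinstance(err, GenerationError)

# Without an `error_type`, the status code selects the exception class.
assert isinstance(parse_error(429, {"error": "Model is overloaded"}), RateLimitExceededError)

# Anything else falls back to UnknownError.
assert isinstance(parse_error(418, {"error": "unexpected"}), UnknownError)
```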
import os
import requests
from typing import Dict, Optional, List
from huggingface_hub.utils import build_hf_headers
from text_generation import Client, AsyncClient, __version__
from text_generation.types import DeployedModel
from text_generation.errors import NotSupportedError, parse_error
INFERENCE_ENDPOINT = os.environ.get(
"HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co"
)
def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
"""
Get all currently deployed models with text-generation-inference support
Returns:
List[DeployedModel]: list of all currently deployed models
"""
resp = requests.get(
f"https://api-inference.huggingface.co/framework/text-generation-inference",
headers=headers,
timeout=5,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
models = [DeployedModel(**raw_deployed_model) for raw_deployed_model in payload]
return models
def check_model_support(repo_id: str, headers: Optional[Dict] = None) -> bool:
"""
Check if a given model is supported by text-generation-inference
Returns:
bool: whether the model is supported by this client
"""
resp = requests.get(
f"https://api-inference.huggingface.co/status/{repo_id}",
headers=headers,
timeout=5,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
framework = payload["framework"]
supported = framework == "text-generation-inference"
return supported
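A quick sketch of the two helpers above, assuming this module is importable as `text_generation.inference_api` and the (legacy) serverless Inference API is reachable:

```python
from text_generation.inference_api import check_model_support, deployed_models

# List models currently served with text-generation-inference support.
for model in deployed_models():
    print(model.model_id, model.sha)

# Check a single repository before building a client for it.
if check_model_support("bigscience/bloomz"):
    print("bigscience/bloomz is served with text-generation-inference")
```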
class InferenceAPIClient(Client):
"""Client to make calls to the HuggingFace Inference API.
Only supports a subset of the available text-generation or text2text-generation models that are served using
text-generation-inference
Example:
```python
>>> from text_generation import InferenceAPIClient
>>> client = InferenceAPIClient("bigscience/bloomz")
>>> client.generate("Why is the sky blue?").generated_text
' Rayleigh scattering'
>>> result = ""
>>> for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
"""
Init headers and API information
Args:
repo_id (`str`):
Id of repository (e.g. `bigscience/bloom`).
token (`str`, `optional`):
The API token to use as HTTP bearer authorization. This is not
the authentication token. You can find the token in
https://huggingface.co/settings/token. Alternatively, you can
find both your organizations and personal API tokens using
`HfApi().whoami(token)`.
timeout (`int`):
Timeout in seconds
"""
headers = build_hf_headers(
token=token, library_name="text-generation", library_version=__version__
)
# Text Generation Inference client only supports a subset of the available hub models
if not check_model_support(repo_id, headers):
raise NotSupportedError(repo_id)
base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"
super(InferenceAPIClient, self).__init__(
base_url, headers=headers, timeout=timeout
)
class InferenceAPIAsyncClient(AsyncClient):
"""Aynschronous Client to make calls to the HuggingFace Inference API.
Only supports a subset of the available text-generation or text2text-generation models that are served using
text-generation-inference
Example:
```python
>>> from text_generation import InferenceAPIAsyncClient
>>> client = InferenceAPIAsyncClient("bigscience/bloomz")
>>> response = await client.generate("Why is the sky blue?")
>>> response.generated_text
' Rayleigh scattering'
>>> result = ""
>>> async for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
"""
Init headers and API information
Args:
repo_id (`str`):
Id of repository (e.g. `bigscience/bloom`).
token (`str`, `optional`):
The API token to use as HTTP bearer authorization. This is not
the authentication token. You can find the token in
https://huggingface.co/settings/token. Alternatively, you can
find both your organizations and personal API tokens using
`HfApi().whoami(token)`.
timeout (`int`):
Timeout in seconds
"""
headers = build_hf_headers(
token=token, library_name="text-generation", library_version=__version__
)
# Text Generation Inference client only supports a subset of the available hub models
if not check_model_support(repo_id, headers):
raise NotSupportedError(repo_id)
base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"
super(InferenceAPIAsyncClient, self).__init__(
base_url, headers=headers, timeout=timeout
)
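A short sketch of the Inference API wrappers above; note that these clients (and the serverless endpoint they target) are deprecated, so treat the call as illustrative only.

```python
from text_generation import InferenceAPIClient

# Token is optional for public models; pass one for gated or rate-limited access.
client = InferenceAPIClient("bigscience/bloomz", token=None, timeout=30)

print(client.generate("Why is the sky blue?", max_new_tokens=20).generated_text)

text = ""
for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=20):
    if not chunk.token.special:
        text += chunk.token.text
print(text)
```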
from enum import Enum
from pydantic import BaseModel, field_validator, ConfigDict
from typing import Optional, List, Union, Any
from text_generation.errors import ValidationError
# enum for grammar type
class GrammarType(str, Enum):
Json = "json"
Regex = "regex"
# Grammar type and value
class Grammar(BaseModel):
# Grammar type
type: GrammarType
# Grammar value
value: Union[str, dict]
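A brief sketch of building `Grammar` values; the JSON schema is a made-up example, and either grammar can be passed to `Client.generate(..., grammar=...)`.

```python
from text_generation.types import Grammar, GrammarType

# Constrain generation to a JSON object matching an illustrative schema.
json_grammar = Grammar(
    type=GrammarType.Json,
    value={
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    },
)

# Or constrain generation to a regular expression.
regex_grammar = Grammar(type=GrammarType.Regex, value=r"\d{4}-\d{2}-\d{2}")
```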
class ToolCall(BaseModel):
# Id of the tool call
id: int
# Type of the tool call
type: str
# Function details of the tool call
function: dict
class Message(BaseModel):
# Role of the message sender
role: str
# Content of the message
content: Optional[str] = None
# Optional name of the message sender
name: Optional[str] = None
# Tool calls associated with the chat completion
tool_calls: Optional[Any] = None
class Tool(BaseModel):
# Type of the tool
type: str
# Function details of the tool
function: dict
class Function(BaseModel):
name: Optional[str]
arguments: str
class ChoiceDeltaToolCall(BaseModel):
index: int
id: str
type: str
function: Function
class ChoiceDelta(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[ChoiceDeltaToolCall]
class Choice(BaseModel):
index: int
delta: ChoiceDelta
logprobs: Optional[dict] = None
finish_reason: Optional[str] = None
class CompletionRequest(BaseModel):
# Model identifier
model: str
# Prompt
prompt: str
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class CompletionComplete(BaseModel):
# Index of the completion choice
index: int
# Text of the completion choice
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Completion(BaseModel):
# Completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[CompletionComplete]
class ChatRequest(BaseModel):
# Model identifier
model: str
# List of messages in the conversation
messages: List[Message]
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Bias values for token selection
logit_bias: Optional[List[float]] = None
# Whether to return log probabilities
logprobs: Optional[bool] = None
# Number of most likely tokens to return at each position
top_logprobs: Optional[int] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Number of chat completion choices to generate
n: Optional[int] = None
# Penalty for presence of new tokens
presence_penalty: Optional[float] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# List of tools to be used
tools: Optional[List[Tool]] = None
# A prompt to be appended before the tools
tool_prompt: Optional[str] = None
# Choice of tool to be used
tool_choice: Optional[str] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[Choice]
class Parameters(BaseModel):
# Activate logits sampling
do_sample: bool = False
# Maximum number of generated tokens
max_new_tokens: int = 20
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Whether to prepend the prompt to the generated text
return_full_text: bool = False
# Stop generating tokens if a member of `stop_sequences` is generated
stop: List[str] = []
# Random sampling seed
seed: Optional[int] = None
# The value used to modulate the logits distribution.
temperature: Optional[float] = None
# The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_k: Optional[int] = None
# If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
# higher are kept for generation.
top_p: Optional[float] = None
# truncate inputs tokens to the given size
truncate: Optional[int] = None
# Typical Decoding mass
# See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
typical_p: Optional[float] = None
# Generate best_of sequences and return the one with the highest token logprobs
best_of: Optional[int] = None
# Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
watermark: bool = False
# Get generation details
details: bool = False
# Get decoder input token logprobs and ids
decoder_input_details: bool = False
# Return the N most likely tokens at each step
top_n_tokens: Optional[int] = None
# grammar to use for generation
grammar: Optional[Grammar] = None
@field_validator("best_of")
def valid_best_of(cls, field_value, values):
if field_value is not None:
if field_value <= 0:
raise ValidationError("`best_of` must be strictly positive")
if field_value > 1 and values.data["seed"] is not None:
raise ValidationError("`seed` must not be set when `best_of` is > 1")
sampling = (
values.data["do_sample"]
| (values.data["temperature"] is not None)
| (values.data["top_k"] is not None)
| (values.data["top_p"] is not None)
| (values.data["typical_p"] is not None)
)
if field_value > 1 and not sampling:
raise ValidationError("you must use sampling when `best_of` is > 1")
return field_value
@field_validator("repetition_penalty")
def valid_repetition_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`repetition_penalty` must be strictly positive")
return v
@field_validator("frequency_penalty")
def valid_frequency_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`frequency_penalty` must be strictly positive")
return v
@field_validator("seed")
def valid_seed(cls, v):
if v is not None and v < 0:
raise ValidationError("`seed` must be positive")
return v
@field_validator("temperature")
def valid_temp(cls, v):
if v is not None and v <= 0:
raise ValidationError("`temperature` must be strictly positive")
return v
@field_validator("top_k")
def valid_top_k(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_k` must be strictly positive")
return v
@field_validator("top_p")
def valid_top_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`top_p` must be > 0.0 and < 1.0")
return v
@field_validator("truncate")
def valid_truncate(cls, v):
if v is not None and v <= 0:
raise ValidationError("`truncate` must be strictly positive")
return v
@field_validator("typical_p")
def valid_typical_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
return v
@field_validator("top_n_tokens")
def valid_top_n_tokens(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_n_tokens` must be strictly positive")
return v
@field_validator("grammar")
def valid_grammar(cls, v):
if v is not None:
if v.type == GrammarType.Regex and not v.value:
raise ValidationError("`value` cannot be empty for `regex` grammar")
if v.type == GrammarType.Json and not v.value:
raise ValidationError("`value` cannot be empty for `json` grammar")
return v
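A short sketch of how the validators above behave; invalid combinations raise the library's own `ValidationError` at construction time.

```python
from text_generation.errors import ValidationError
from text_generation.types import Parameters

# Sampling knobs are checked eagerly when the model is constructed.
params = Parameters(do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=16)
print(params.max_new_tokens)

# `best_of > 1` requires sampling (and forbids a fixed seed).
try:
    Parameters(best_of=2, do_sample=False)
except ValidationError as exc:
    print(exc)  # you must use sampling when `best_of` is > 1
```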
class Request(BaseModel):
# Prompt
inputs: str
# Generation parameters
parameters: Optional[Parameters] = None
# Whether to stream output tokens
stream: bool = False
@field_validator("inputs")
def valid_input(cls, v):
if not v:
raise ValidationError("`inputs` cannot be empty")
return v
@field_validator("stream")
def valid_best_of_stream(cls, field_value, values):
parameters = values.data["parameters"]
if (
parameters is not None
and parameters.best_of is not None
and parameters.best_of > 1
and field_value
):
raise ValidationError(
"`best_of` != 1 is not supported when `stream` == True"
)
return field_value
# Decoder input tokens
class InputToken(BaseModel):
# Token ID from the model tokenizer
id: int
# Token text
text: str
# Logprob
# Optional since the logprob of the first token cannot be computed
logprob: Optional[float] = None
# Generated tokens
class Token(BaseModel):
# Token ID from the model tokenizer
id: int
# Token text
text: str
# Logprob
logprob: Optional[float] = None
# Is the token a special token
# Can be used to ignore tokens when concatenating
special: bool
# Generation finish reason
class FinishReason(str, Enum):
# number of generated tokens == `max_new_tokens`
Length = "length"
# the model generated its end of sequence token
EndOfSequenceToken = "eos_token"
# the model generated a text included in `stop_sequences`
StopSequence = "stop_sequence"
# Additional sequences when using the `best_of` parameter
class BestOfSequence(BaseModel):
# Generated text
generated_text: str
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# Decoder input tokens, empty if decoder_input_details is False
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]] = None
# `generate` details
class Details(BaseModel):
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# Decoder input tokens, empty if decoder_input_details is False
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]] = None
# Additional sequences when using the `best_of` parameter
best_of_sequences: Optional[List[BestOfSequence]] = None
# `generate` return value
class Response(BaseModel):
# Generated text
generated_text: str
# Generation details
details: Details
# `generate_stream` details
class StreamDetails(BaseModel):
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# `generate_stream` return value
class StreamResponse(BaseModel):
# Generated token
token: Token
# Most likely tokens
top_tokens: Optional[List[Token]] = None
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str] = None
# Generation details
# Only available when the generation is finished
details: Optional[StreamDetails] = None
# Inference API currently deployed model
class DeployedModel(BaseModel):
# Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
# with model_ prefixes, since this disables guardrails for colliding fields:
# https://github.com/pydantic/pydantic/issues/9177
model_config = ConfigDict(protected_namespaces=())
model_id: str
sha: str
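To tie the request/response models together, a sketch that builds the payload `Client.generate` posts and parses an illustrative (trimmed) server reply:

```python
import json

from text_generation.types import Parameters, Request, Response

# Build the JSON body that Client.generate posts to the server root.
request = Request(
    inputs="Why is the sky blue?",
    stream=False,
    parameters=Parameters(max_new_tokens=16, details=True),
)
print(json.dumps(request.dict(), indent=2))

# Parse an illustrative server reply back into a Response model.
raw = {
    "generated_text": " Rayleigh scattering",
    "details": {
        "finish_reason": "length",
        "generated_tokens": 16,
        "prefill": [],
        "tokens": [],
    },
}
print(Response(**raw).generated_text)
```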
Documentation available at: https://huggingface.co/docs/text-generation-inference
## Release
When making a release, please update the latest version in the documentation with:
```
export OLD_VERSION="2\.0\.3"
export NEW_VERSION="2\.0\.4"
find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
```
<html>
<head>
<!-- Load the latest Swagger UI code and style from npm using unpkg.com -->
<script src="https://unpkg.com/swagger-ui-dist@3/swagger-ui-bundle.js"></script>
<link rel="stylesheet" type="text/css" href="https://unpkg.com/swagger-ui-dist@3/swagger-ui.css"/>
<title>Text Generation Inference API</title>
</head>
<body>
<div id="swagger-ui"></div> <!-- Div to hold the UI component -->
<script>
window.onload = function () {
// Begin Swagger UI call region
const ui = SwaggerUIBundle({
url: "openapi.json", //Location of Open API spec in the repo
dom_id: '#swagger-ui',
deepLinking: true,
supportedSubmitMethods: [],
presets: [
SwaggerUIBundle.presets.apis,
SwaggerUIBundle.SwaggerUIStandalonePreset
],
plugins: [
SwaggerUIBundle.plugins.DownloadUrl
],
})
window.ui = ui
}
</script>
</body>
</html>
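The OpenAPI document below describes the raw HTTP endpoints; as a sketch, the `/generate` and `/generate_stream` routes can be called directly with `requests` (the base URL is a placeholder for a running server).

```python
import requests

BASE_URL = "http://localhost:8080"  # placeholder text-generation-inference server

# POST /generate returns a single generated text with optional details.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={
        "inputs": "Why is the sky blue?",
        "parameters": {"max_new_tokens": 16, "details": True},
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["generated_text"])

# POST /generate_stream returns Server-Sent Events, one `data:` line per token.
with requests.post(
    f"{BASE_URL}/generate_stream",
    json={"inputs": "Why is the sky blue?", "parameters": {"max_new_tokens": 16}},
    stream=True,
    timeout=30,
) as stream:
    for line in stream.iter_lines():
        if line.startswith(b"data:"):
            print(line.decode("utf-8"))
```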
{
"openapi": "3.0.3",
"info": {
"title": "Text Generation Inference",
"description": "Text Generation Webserver",
"contact": {
"name": "Olivier Dehaene"
},
"license": {
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.1.1"
},
"paths": {
"/": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"operationId": "compat_generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompatGenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate_stream": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate a stream of token using Server-Sent Events",
"operationId": "generate_stream",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/health": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Health check method",
"operationId": "health",
"responses": {
"200": {
"description": "Everything is working fine"
},
"503": {
"description": "Text generation inference is down",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "unhealthy",
"error_type": "healthcheck"
}
}
}
}
}
}
},
"/info": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Text Generation Inference endpoint info",
"operationId": "get_model_info",
"responses": {
"200": {
"description": "Served model info",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Info"
}
}
}
}
}
}
},
"/metrics": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Prometheus metrics scrape endpoint",
"operationId": "metrics",
"responses": {
"200": {
"description": "Prometheus Metrics",
"content": {
"text/plain": {
"schema": {
"type": "string"
}
}
}
}
}
}
},
"/tokenize": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Tokenize inputs",
"operationId": "tokenize",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Tokenized ids",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TokenizeResponse"
}
}
}
},
"404": {
"description": "No tokenizer found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "No fast tokenizer available"
}
}
}
}
}
}
},
"/v1/chat/completions": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "chat_completions",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Chat Completion",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatCompletion"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ChatCompletionChunk"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/v1/completions": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "completions",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Chat Completion",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Completion"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionCompleteChunk"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
}
},
"components": {
"schemas": {
"BestOfSequence": {
"type": "object",
"required": [
"generated_text",
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_text": {
"type": "string",
"example": "test"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ChatCompletion": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
"ChatCompletionChoice": {
"type": "object",
"required": [
"index",
"delta"
],
"properties": {
"delta": {
"$ref": "#/components/schemas/ChatCompletionDelta"
},
"finish_reason": {
"type": "string",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"allOf": [
{
"$ref": "#/components/schemas/ChatCompletionLogprobs"
}
],
"nullable": true
}
}
},
"ChatCompletionChunk": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionChoice"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270978",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
}
}
},
"ChatCompletionComplete": {
"type": "object",
"required": [
"index",
"message",
"finish_reason"
],
"properties": {
"finish_reason": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"allOf": [
{
"$ref": "#/components/schemas/ChatCompletionLogprobs"
}
],
"nullable": true
},
"message": {
"$ref": "#/components/schemas/OutputMessage"
}
}
},
"ChatCompletionDelta": {
"oneOf": [
{
"$ref": "#/components/schemas/TextMessage"
},
{
"$ref": "#/components/schemas/ToolCallDelta"
}
]
},
"ChatCompletionLogprob": {
"type": "object",
"required": [
"token",
"logprob",
"top_logprobs"
],
"properties": {
"logprob": {
"type": "number",
"format": "float"
},
"token": {
"type": "string"
},
"top_logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionTopLogprob"
}
}
}
},
"ChatCompletionLogprobs": {
"type": "object",
"required": [
"content"
],
"properties": {
"content": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionLogprob"
}
}
}
},
"ChatCompletionTopLogprob": {
"type": "object",
"required": [
"token",
"logprob"
],
"properties": {
"logprob": {
"type": "number",
"format": "float"
},
"token": {
"type": "string"
}
}
},
"ChatRequest": {
"type": "object",
"required": [
"model",
"messages"
],
"properties": {
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"example": "1.0",
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {
"type": "number",
"format": "float"
},
"description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
"nullable": true
},
"logprobs": {
"type": "boolean",
"description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
"example": "false",
"nullable": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"example": "32",
"nullable": true,
"minimum": 0
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
},
"description": "A list of messages comprising the conversation so far.",
"example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
},
"model": {
"type": "string",
"description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"n": {
"type": "integer",
"format": "int32",
"description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
"example": "2",
"nullable": true,
"minimum": 0
},
"presence_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
"example": 0.1,
"nullable": true
},
"response_format": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"default": "null",
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens.",
"example": "null",
"nullable": true
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number",
"format": "float",
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
"example": 1.0,
"nullable": true
},
"tool_choice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"tool_prompt": {
"type": "string",
"description": "A prompt to be appended before the tools",
"example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
"nullable": true
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Tool"
},
"description": "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for.",
"example": "null",
"nullable": true
},
"top_logprobs": {
"type": "integer",
"format": "int32",
"description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
"example": "5",
"nullable": true,
"minimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
"example": 0.95,
"nullable": true
}
}
},
"Chunk": {
"type": "object",
"required": [
"id",
"created",
"choices",
"model",
"system_fingerprint"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
}
}
},
"CompatGenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
},
"stream": {
"type": "boolean",
"default": "false"
}
}
},
"Completion": {
"oneOf": [
{
"allOf": [
{
"$ref": "#/components/schemas/Chunk"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
},
{
"allOf": [
{
"$ref": "#/components/schemas/CompletionFinal"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
}
],
"discriminator": {
"propertyName": "object"
}
},
"CompletionComplete": {
"type": "object",
"required": [
"index",
"text",
"finish_reason"
],
"properties": {
"finish_reason": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"type": "array",
"items": {
"type": "number",
"format": "float"
},
"nullable": true
},
"text": {
"type": "string"
}
}
},
"CompletionFinal": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
"CompletionRequest": {
"type": "object",
"required": [
"model",
"prompt"
],
"properties": {
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"example": "1.0",
"nullable": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "32",
"nullable": true,
"minimum": 0
},
"model": {
"type": "string",
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"prompt": {
"$ref": "#/components/schemas/Prompt"
},
"repetition_penalty": {
"type": "number",
"format": "float",
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens.",
"example": "null",
"nullable": true
},
"stream": {
"type": "boolean"
},
"suffix": {
"type": "string",
"description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template.",
"nullable": true
},
"temperature": {
"type": "number",
"format": "float",
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.",
"example": 1.0,
"nullable": true
},
"top_p": {
"type": "number",
"format": "float",
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
"example": 0.95,
"nullable": true
}
}
},
"DeltaToolCall": {
"type": "object",
"required": [
"index",
"id",
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/Function"
},
"id": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"type": {
"type": "string"
}
}
},
"Details": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"best_of_sequences": {
"type": "array",
"items": {
"$ref": "#/components/schemas/BestOfSequence"
},
"nullable": true
},
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ErrorResponse": {
"type": "object",
"required": [
"error",
"error_type"
],
"properties": {
"error": {
"type": "string"
},
"error_type": {
"type": "string"
}
}
},
"FinishReason": {
"type": "string",
"enum": [
"length",
"eos_token",
"stop_sequence"
],
"example": "Length"
},
"Function": {
"type": "object",
"required": [
"arguments"
],
"properties": {
"arguments": {
"type": "string"
},
"name": {
"type": "string",
"nullable": true
}
}
},
"FunctionDefinition": {
"type": "object",
"required": [
"name",
"arguments"
],
"properties": {
"arguments": {},
"description": {
"type": "string",
"nullable": true
},
"name": {
"type": "string"
}
}
},
"GenerateParameters": {
"type": "object",
"properties": {
"adapter_id": {
"type": "string",
"description": "Lora adapter id",
"default": "null",
"example": "null",
"nullable": true
},
"best_of": {
"type": "integer",
"description": "Generate best_of sequences and return the one if the highest token logprobs.",
"default": "null",
"example": 1,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"decoder_input_details": {
"type": "boolean",
"description": "Whether to return decoder input token logprobs and ids.",
"default": "false"
},
"details": {
"type": "boolean",
"description": "Whether to return generation details.",
"default": "true"
},
"do_sample": {
"type": "boolean",
"description": "Activate logits sampling.",
"default": "false",
"example": true
},
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"default": "null",
"example": 0.1,
"nullable": true,
"exclusiveMinimum": -2
},
"grammar": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"default": "null",
"nullable": true
},
"max_new_tokens": {
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"example": "20",
"nullable": true,
"minimum": 0
},
"repetition_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.",
"default": "null",
"example": 1.03,
"nullable": true,
"exclusiveMinimum": 0
},
"return_full_text": {
"type": "boolean",
"description": "Whether to prepend the prompt to the generated text",
"default": "null",
"example": false,
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"description": "Random sampling seed.",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Stop generating tokens if a member of `stop` is generated.",
"example": [
"photographer"
],
"maxItems": 4
},
"temperature": {
"type": "number",
"format": "float",
"description": "The value used to module the logits distribution.",
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0
},
"top_k": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.",
"default": "null",
"example": 10,
"nullable": true,
"exclusiveMinimum": 0
},
"top_n_tokens": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.",
"default": "null",
"example": 5,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"description": "Top-p value for nucleus sampling.",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"truncate": {
"type": "integer",
"description": "Truncate inputs tokens to the given size.",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0
},
"typical_p": {
"type": "number",
"format": "float",
"description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"watermark": {
"type": "boolean",
"description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).",
"default": "false",
"example": true
}
}
},
"GenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
}
}
},
"GenerateResponse": {
"type": "object",
"required": [
"generated_text"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/Details"
}
],
"nullable": true
},
"generated_text": {
"type": "string",
"example": "test"
}
}
},
"GrammarType": {
"oneOf": [
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"json"
]
},
"value": {
"description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
}
}
},
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"regex"
]
},
"value": {
"type": "string"
}
}
}
],
"discriminator": {
"propertyName": "type"
}
},
"Info": {
"type": "object",
"required": [
"model_id",
"model_dtype",
"model_device_type",
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_tokens",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"max_client_batch_size",
"router",
"version"
],
"properties": {
"docker_label": {
"type": "string",
"example": "null",
"nullable": true
},
"max_batch_size": {
"type": "integer",
"example": "null",
"nullable": true,
"minimum": 0
},
"max_batch_total_tokens": {
"type": "integer",
"format": "int32",
"example": "32000",
"minimum": 0
},
"max_best_of": {
"type": "integer",
"example": "2",
"minimum": 0
},
"max_client_batch_size": {
"type": "integer",
"example": "32",
"minimum": 0
},
"max_concurrent_requests": {
"type": "integer",
"description": "Router Parameters",
"example": "128",
"minimum": 0
},
"max_input_tokens": {
"type": "integer",
"example": "1024",
"minimum": 0
},
"max_stop_sequences": {
"type": "integer",
"example": "4",
"minimum": 0
},
"max_total_tokens": {
"type": "integer",
"example": "2048",
"minimum": 0
},
"max_waiting_tokens": {
"type": "integer",
"example": "20",
"minimum": 0
},
"model_device_type": {
"type": "string",
"example": "cuda"
},
"model_dtype": {
"type": "string",
"example": "torch.float16"
},
"model_id": {
"type": "string",
"description": "Model info",
"example": "bigscience/blomm-560m"
},
"model_pipeline_tag": {
"type": "string",
"example": "text-generation",
"nullable": true
},
"model_sha": {
"type": "string",
"example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
"nullable": true
},
"router": {
"type": "string",
"description": "Router Info",
"example": "text-generation-router"
},
"sha": {
"type": "string",
"example": "null",
"nullable": true
},
"validation_workers": {
"type": "integer",
"example": "2",
"minimum": 0
},
"version": {
"type": "string",
"example": "0.5.0"
},
"waiting_served_ratio": {
"type": "number",
"format": "float",
"example": "1.2"
}
}
},
"Message": {
"type": "object",
"required": [
"role",
"content"
],
"properties": {
"content": {
"$ref": "#/components/schemas/MessageContent"
},
"name": {
"type": "string",
"example": "\"David\"",
"nullable": true
},
"role": {
"type": "string",
"example": "user"
}
}
},
"PrefillToken": {
"type": "object",
"required": [
"id",
"text",
"logprob"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"text": {
"type": "string",
"example": "test"
}
}
},
"Prompt": {
"type": "array",
"items": {
"type": "string"
}
},
"SimpleToken": {
"type": "object",
"required": [
"id",
"text",
"start",
"stop"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"start": {
"type": "integer",
"example": 0,
"minimum": 0
},
"stop": {
"type": "integer",
"example": 2,
"minimum": 0
},
"text": {
"type": "string",
"example": "test"
}
}
},
"StreamDetails": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
}
}
},
"StreamResponse": {
"type": "object",
"required": [
"index",
"token"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/StreamDetails"
}
],
"default": "null",
"nullable": true
},
"generated_text": {
"type": "string",
"default": "null",
"example": "test",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"token": {
"$ref": "#/components/schemas/Token"
},
"top_tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
},
"Token": {
"type": "object",
"required": [
"id",
"text",
"logprob",
"special"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"special": {
"type": "boolean",
"example": "false"
},
"text": {
"type": "string",
"example": "test"
}
}
},
"TokenizeResponse": {
"type": "array",
"items": {
"$ref": "#/components/schemas/SimpleToken"
}
},
"Tool": {
"type": "object",
"required": [
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionDefinition"
},
"type": {
"type": "string",
"example": "function"
}
}
},
"ToolCall": {
"type": "object",
"required": [
"id",
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionDefinition"
},
"id": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"ToolType": {
"oneOf": [
{
"type": "object",
"default": null,
"nullable": true
},
{
"type": "string"
},
{
"type": "object",
"required": [
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionName"
}
}
}
]
},
"Usage": {
"type": "object",
"required": [
"prompt_tokens",
"completion_tokens",
"total_tokens"
],
"properties": {
"completion_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"prompt_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"total_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
}
}
}
}
},
"tags": [
{
"name": "Text Generation Inference",
"description": "Hugging Face Text Generation Inference API"
}
]
}
- sections:
- local: index
title: Text Generation Inference
- local: quicktour
title: Quick Tour
- local: installation_nvidia
title: Using TGI with Nvidia GPUs
- local: installation_amd
title: Using TGI with AMD GPUs
- local: installation_gaudi
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
title: Consuming TGI
- local: basic_tutorials/preparing_model
title: Preparing Model for Serving
- local: basic_tutorials/gated_model_access
title: Serving Private & Gated Models
- local: basic_tutorials/using_cli
title: Using TGI CLI
- local: basic_tutorials/launcher
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
title: Safety
- local: basic_tutorials/using_guidance
title: Using Guidance, JSON, tools
- local: basic_tutorials/visual_language_models
title: Visual Language Models
- local: basic_tutorials/monitoring
title: Monitoring TGI with Prometheus and Grafana
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: conceptual/streaming
title: Streaming
- local: conceptual/quantization
title: Quantization
- local: conceptual/tensor_parallelism
title: Tensor Parallelism
- local: conceptual/paged_attention
title: PagedAttention
- local: conceptual/safetensors
title: Safetensors
- local: conceptual/flash_attention
title: Flash Attention
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)
title: Conceptual Guides
# Text Generation Inference Architecture
This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.
A high-level architecture diagram can be seen here:
![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
This diagram shows clearly that there are these separate components:
- **The router**, also named `webserver`, receives the client requests, buffers them, creates batches, and prepares gRPC calls to a model server.
- **The model server** is responsible for receiving the gRPC requests and running inference on the model. If the model is sharded across multiple accelerators (e.g. multiple GPUs), the model server shards might be synchronized via NCCL or an equivalent library.
- **The launcher** is a helper that launches one or several model servers (if the model is sharded), and then launches the router with compatible arguments.
The router and the model server can run on two different machines; they do not need to be deployed together.
## The Router
This component is a rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
The router receives the API calls and handles the batching logic (an introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
It uses different strategies to reduce latency between requests and responses, especially around decoding latency. It relies on queues, schedulers, and block allocators to achieve that and to produce batched requests that it then sends to the model server.
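As a quick illustration, here is a minimal sketch of calling the OpenAI-compatible `/v1/chat/completions` route exposed by the router. It assumes a TGI instance listening on `http://127.0.0.1:8080` (adjust host and port to your deployment); the payload fields follow the `ChatRequest` schema of the HTTP API.
```python
import requests

# Hypothetical local TGI endpoint; adjust host/port to your deployment.
BASE_URL = "http://127.0.0.1:8080"

payload = {
    "model": "tgi",  # the model id is unused by TGI, any string works
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 32,
}

# POST to the OpenAI-compatible chat completions route served by the router.
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"])
```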
### Router's command line
The router command line is the way to pass parameters to it (it does not rely on a configuration file):
```
Text Generation Webserver
Usage: text-generation-router [OPTIONS]
Options:
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
[env: MAX_CONCURRENT_REQUESTS=] [default: 128]
--max-best-of <MAX_BEST_OF>
[env: MAX_BEST_OF=] [default: 2]
--max-stop-sequences <MAX_STOP_SEQUENCES>
[env: MAX_STOP_SEQUENCES=] [default: 4]
--max-top-n-tokens <MAX_TOP_N_TOKENS>
[env: MAX_TOP_N_TOKENS=] [default: 5]
--max-input-tokens <MAX_INPUT_TOKENS>
[env: MAX_INPUT_TOKENS=] [default: 1024]
--max-total-tokens <MAX_TOTAL_TOKENS>
[env: MAX_TOTAL_TOKENS=] [default: 2048]
--waiting-served-ratio <WAITING_SERVED_RATIO>
[env: WAITING_SERVED_RATIO=] [default: 1.2]
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
[env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
--max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
[env: MAX_BATCH_TOTAL_TOKENS=]
--max-waiting-tokens <MAX_WAITING_TOKENS>
[env: MAX_WAITING_TOKENS=] [default: 20]
--max-batch-size <MAX_BATCH_SIZE>
[env: MAX_BATCH_SIZE=]
--hostname <HOSTNAME>
[env: HOSTNAME=] [default: 0.0.0.0]
-p, --port <PORT>
[env: PORT=] [default: 3000]
--master-shard-uds-path <MASTER_SHARD_UDS_PATH>
[env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
--tokenizer-name <TOKENIZER_NAME>
[env: TOKENIZER_NAME=] [default: bigscience/bloom]
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
[env: TOKENIZER_CONFIG_PATH=]
--revision <REVISION>
[env: REVISION=]
--validation-workers <VALIDATION_WORKERS>
[env: VALIDATION_WORKERS=] [default: 2]
--json-output
[env: JSON_OUTPUT=]
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
--ngrok
[env: NGROK=]
--ngrok-authtoken <NGROK_AUTHTOKEN>
[env: NGROK_AUTHTOKEN=]
--ngrok-edge <NGROK_EDGE>
[env: NGROK_EDGE=]
--messages-api-enabled
[env: MESSAGES_API_ENABLED=]
--disable-grammar-support
[env: DISABLE_GRAMMAR_SUPPORT=]
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
[env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
-h, --help
Print help
-V, --version
Print version
```
## The Model Server
The model server is a Python server that waits for gRPC requests, loads a given model, optionally performs sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
The model server supports models instantiated using PyTorch and optimized for inference, mainly on CUDA/ROCm.
### Model Server Variants
Several variants of the model server exist that are actively supported by Hugging Face:
- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a [forked repository](https://github.com/huggingface/tgi-gaudi), often resynchronized with the main TGI repository.
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
### Command Line Interface
The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:
- `download-weights` will download weights from the hub and, in some variants, convert them to a format adapted to the given implementation;
- `quantize` will quantize a model using the `gptq` package. This feature is not available or supported on all variants;
- `serve` will start the server that loads a model (or a model shard), receives gRPC calls from the router, performs inference, and provides a formatted response to the given request.
The `serve` subcommand's command line parameters on the TGI repository are:
```
Usage: cli.py serve [OPTIONS] MODEL_ID
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
│ * model_id TEXT [default: None] [required] │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --revision TEXT [default: None] │
│ --sharded --no-sharded [default: no-sharded] │
│ --quantize [bitsandbytes|bitsandbytes [default: None] │
│ -nf4|bitsandbytes-fp4|gptq │
│ |awq|eetq|exl2|fp8] │
│ --speculate INTEGER [default: None] │
│ --dtype [float16|bfloat16] [default: None] │
│ --trust-remote-code --no-trust-remote-code [default: │
│ no-trust-remote-code] │
│ --uds-path PATH [default: │
│ /tmp/text-generation-serve… │
│ --logger-level TEXT [default: INFO] │
│ --json-output --no-json-output [default: no-json-output] │
│ --otlp-endpoint TEXT [default: None] │
│ --otlp-service-name TEXT [default: │
│ text-generation-inference...│
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
Note that some variants might support different parameters, and they may accept more options that can be passed using environment variables.
## Call Flow
Once both components are initialized, the weights are downloaded, and the model server is up and running, the router and the model server exchange data and info through gRPC calls. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:
- input chunks support, for text and image data,
- paged attention support
Here's a diagram that displays the exchanges that follow the router and model server startup.
```mermaid
sequenceDiagram
Router->>Model Server: service discovery
Model Server-->>Router: urls for other shards
Router->>Model Server: get model info
Model Server-->>Router: shard info
Router->>Model Server: health check
Model Server-->>Router: health OK
Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
Model Server-->>Router: warmup result
```
After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.
```mermaid
sequenceDiagram
participant Client 1
participant Client 2
participant Client 3
participant Router
participant Model Server
Client 1->>Router: generate_stream
Router->>Model Server: prefill(batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 1
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 2
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 3
Client 2->>Router: generate_stream
Router->>Model Server: prefill(batch2)
Note right of Model Server: This stops the previous batch, which is restarted
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 1'
Router->>Model Server: decode(cached_batch1, cached_batch2)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 4
Router-->>Client 2: token 2'
Note left of Client 1: Client 1 leaves
Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
Model Server-->>Router: filtered batch
Router->>Model Server: decode(cached_batch2)
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 3'
Client 3->>Router: generate_stream
Note right of Model Server: This stops the previous batch, which is restarted
Router->>Model Server: prefill(batch3)
Note left of Client 3: Client 3 leaves without receiving any tokens
Router->>Model Server: clear_cache(batch3)
Note right of Model Server: This stops the previous batch, which is restarted
Router->>Model Server: decode(cached_batch2)
Note right of Model Server: Last token (stopping criteria)
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 4'
```
# Consuming Text Generation Inference
There are many ways to consume the Text Generation Inference server in your applications. After launching, you can use the `/generate` route and make a `POST` request to get results from the server. You can also use the `/generate_stream` route if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python, or TypeScript. For a full end-to-end experience, we also open-sourced ChatUI, a chat interface for open-source models.
## curl
After the launch, you can query the model using either the `/generate` or `/generate_stream` routes:
```bash
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
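If you prefer Python over curl, here is a minimal sketch (assuming the server runs on `127.0.0.1:8080`) that consumes the `/generate_stream` route by reading the server-sent events and printing each token as it arrives:
```python
import json
import requests

# Hypothetical local endpoint; adjust to where your TGI server is listening.
url = "http://127.0.0.1:8080/generate_stream"
payload = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}

with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        # Server-sent events are prefixed with "data:"; skip keep-alive blank lines.
        if line and line.startswith(b"data:"):
            event = json.loads(line[len(b"data:"):])
            print(event["token"]["text"], end="", flush=True)
print()
```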
## Inference Client
[`huggingface-hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a high-level class, [`~huggingface_hub.InferenceClient`], which makes it easy to make calls to a TGI endpoint. `InferenceClient` also takes care of parameter validation and provides a simple-to-use interface.
You can simply install the `huggingface-hub` package with pip.
```bash
pip install huggingface-hub
```
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
```python
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
client.text_generation(prompt="Write a code for snake game")
```
You can do streaming with `InferenceClient` by passing `stream=True`. Streaming will return tokens as they are being generated in the server. To use streaming, you can do as follows:
```python
for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
print(token)
```
Another parameter you can use with the TGI backend is `details`. You can get more details on generation (tokens, probabilities, etc.) by setting `details` to `True`. When it's specified, TGI will return a `TextGenerationResponse` or `TextGenerationStreamResponse` rather than a string or stream.
```python
output = client.text_generation(prompt="Meaning of life is", details=True)
print(output)
# TextGenerationResponse(generated_text=' a complex concept that is not always clear to the individual. It is a concept that is not always', details=Details(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=20, seed=None, prefill=[], tokens=[Token(id=267, text=' a', logprob=-2.0723474, special=False), Token(id=11235, text=' complex', logprob=-3.1272552, special=False), Token(id=17908, text=' concept', logprob=-1.3632495, special=False),..))
```
You can see how to stream below.
```python
output = client.text_generation(prompt="Meaning of life is", stream=True, details=True)
print(next(iter(output)))
# TextGenerationStreamResponse(token=Token(id=267, text=' a', logprob=-2.0723474, special=False), generated_text=None, details=None)
```
You can check out the details of the function [here](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient).
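As a quick illustration, a minimal sketch using `AsyncInferenceClient` to stream tokens (assuming the same local endpoint on port 8080) could look like this:
```python
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    # Point the async client at the hypothetical local TGI endpoint.
    client = AsyncInferenceClient(model="http://127.0.0.1:8080")
    # With stream=True, the awaited call returns an async iterator of tokens.
    async for token in await client.text_generation(
        "How do you make cheese?", max_new_tokens=12, stream=True
    ):
        print(token, end="")
    print()


asyncio.run(main())
```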
## ChatUI
ChatUI is an open-source interface built for LLM serving. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
To serve both ChatUI and TGI in the same environment, simply add your own endpoints to the `MODELS` variable in the `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
```
{
// rest of the model config here
"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
}
```
![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
## Gradio
Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot in streaming mode using TGI and Gradio. First, install Gradio and the Hub Python library.
```bash
pip install huggingface-hub gradio
```
Assuming you are serving your model on port 8080, we will query it through [InferenceClient](consuming_tgi#inference-client).
```python
import gradio as gr
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
def inference(message, history):
partial_message = ""
for token in client.text_generation(message, max_new_tokens=20, stream=True):
partial_message += token
yield partial_message
gr.ChatInterface(
inference,
chatbot=gr.Chatbot(height=300),
textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
description="This is the demo for Gradio UI consuming TGI endpoint with LLaMA 7B-Chat model.",
title="Gradio 🤝 TGI",
examples=["Are tomatoes vegetables?"],
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
).queue().launch()
```
The UI looks like this 👇
<div class="flex justify-center">
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
/>
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
/>
</div>
You can try the demo directly here 👇
<div class="block dark:hidden">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=light"
width="850"
height="750"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
width="850"
height="750"
></iframe>
</div>
You can disable streaming mode by using `return` instead of `yield` in your inference function, like below.
```python
def inference(message, history):
return client.text_generation(message, max_new_tokens=20)
```
You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).
## API documentation
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
# Serving Private & Gated Models
If the model you wish to serve is behind gated access or the model repository on the Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from the [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens).
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
```
export HF_TOKEN=<YOUR READ TOKEN>
```
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
```bash
model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data
token=<your READ token>
docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
--model-id $model
```
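Before launching the server, you may want to check that the token actually grants access to the gated repository. Here is a small optional sketch using `huggingface_hub` (the repository id is just the example model used above, and the token placeholder is the same one you pass to the container):
```python
from huggingface_hub import HfApi

token = "<your READ token>"  # same token you pass to the container
api = HfApi(token=token)

# Raises an error if the token is invalid or lacks access to the gated repo.
print(api.whoami()["name"])
info = api.model_info("meta-llama/Llama-2-7b-chat-hf")
print(f"Token can access: {info.id}")
```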
# Text-generation-launcher arguments
<!-- WRAP CODE BLOCKS -->
```shell
Text Generation Launcher
Usage: text-generation-launcher [OPTIONS]
Options:
```
## MODEL_ID
```shell
--model-id <MODEL_ID>
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
[env: MODEL_ID=]
[default: bigscience/bloom-560m]
```
## REVISION
```shell
--revision <REVISION>
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
[env: REVISION=]
```
## VALIDATION_WORKERS
```shell
--validation-workers <VALIDATION_WORKERS>
The number of tokenizer workers used for payload validation and truncation inside the router
[env: VALIDATION_WORKERS=]
[default: 2]
```
## SHARDED
```shell
--sharded <SHARDED>
Whether to shard the model across multiple GPUs. By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
[env: SHARDED=]
[possible values: true, false]
```
## NUM_SHARD
```shell
--num-shard <NUM_SHARD>
The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to launch 2 copies with 2 shards each on a given machine with 4 GPUs, for instance
[env: NUM_SHARD=]
```
## QUANTIZE
```shell
--quantize <QUANTIZE>
Whether you want the model to be quantized
[env: QUANTIZE=]
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernels (wider support) when it's not. AWQ has faster kernels
- marlin: 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
- fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above. This dtype has native ops and should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
```
## SPECULATE
```shell
--speculate <SPECULATE>
The number of input_ids to speculate on. If using a medusa model, the heads will be picked up automatically. Otherwise, it will use n-gram speculation which is relatively free in terms of compute, but the speedup heavily depends on the task
[env: SPECULATE=]
```
## DTYPE
```shell
--dtype <DTYPE>
The dtype to be forced upon the model. This option cannot be used with `--quantize`
[env: DTYPE=]
[possible values: float16, bfloat16]
```
## TRUST_REMOTE_CODE
```shell
--trust-remote-code
Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
[env: TRUST_REMOTE_CODE=]
```
## MAX_CONCURRENT_REQUESTS
```shell
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse client requests instead of having them wait for too long and is usually good to handle backpressure correctly
[env: MAX_CONCURRENT_REQUESTS=]
[default: 128]
```
## MAX_BEST_OF
```shell
--max-best-of <MAX_BEST_OF>
This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and returns the best in terms of overall log probability over the entire generated sequence
[env: MAX_BEST_OF=]
[default: 2]
```
## MAX_STOP_SEQUENCES
```shell
--max-stop-sequences <MAX_STOP_SEQUENCES>
This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
[env: MAX_STOP_SEQUENCES=]
[default: 4]
```
## MAX_TOP_N_TOKENS
```shell
--max-top-n-tokens <MAX_TOP_N_TOKENS>
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like classification or ranking
[env: MAX_TOP_N_TOKENS=]
[default: 5]
```
## MAX_INPUT_TOKENS
```shell
--max-input-tokens <MAX_INPUT_TOKENS>
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompts users can send, which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence lengths they can handle. Defaults to min(max_position_embeddings - 1, 4095)
[env: MAX_INPUT_TOKENS=]
```
## MAX_INPUT_LENGTH
```shell
--max-input-length <MAX_INPUT_LENGTH>
Legacy version of [`Args::max_input_tokens`]
[env: MAX_INPUT_LENGTH=]
```
## MAX_TOTAL_TOKENS
```shell
--max-total-tokens <MAX_TOTAL_TOKENS>
This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will take in your RAM and the less effective batching can be. Defaults to min(max_position_embeddings, 4096)
[env: MAX_TOTAL_TOKENS=]
```
## WAITING_SERVED_RATIO
```shell
--waiting-served-ratio <WAITING_SERVED_RATIO>
This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are only 10 queries left in the current batch, we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens, delaying the 10 running queries by a `prefill` run.
This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
[env: WAITING_SERVED_RATIO=]
[default: 0.3]
```
## MAX_BATCH_PREFILL_TOKENS
```shell
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Defaults to `max_input_tokens + 50` to give a bit of room
[env: MAX_BATCH_PREFILL_TOKENS=]
```
## MAX_BATCH_TOTAL_TOKENS
```shell
--max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
**IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
However in the non-padded (flash attention) version this can be much finer.
For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically.
[env: MAX_BATCH_TOTAL_TOKENS=]
```
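To make the budget arithmetic above concrete, here is a small illustrative sketch (not the router's actual scheduling code; the values are hypothetical, taken from the examples above) that checks whether a set of requests fits under given `--max-total-tokens` and `--max-batch-total-tokens` budgets:
```python
# Illustrative only: the real router tracks budgets per block/token, not per request.
MAX_TOTAL_TOKENS = 1512         # per-request budget: prompt tokens + max_new_tokens
MAX_BATCH_TOTAL_TOKENS = 32000  # whole-batch budget across all running requests


def fits_in_batch(requests):
    """requests: list of (prompt_tokens, max_new_tokens) tuples."""
    per_request_ok = all(p + n <= MAX_TOTAL_TOKENS for p, n in requests)
    batch_total = sum(p + n for p, n in requests)
    return per_request_ok and batch_total <= MAX_BATCH_TOTAL_TOKENS


# A prompt of 1000 tokens asking for 512 new tokens uses the full 1512 budget.
print(fits_in_batch([(1000, 512)]))      # True
print(fits_in_batch([(1000, 513)]))      # False: exceeds max-total-tokens
print(fits_in_batch([(1500, 12)] * 30))  # False: 45360 > max-batch-total-tokens
```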
## MAX_WAITING_TOKENS
```shell
--max-waiting-tokens <MAX_WAITING_TOKENS>
This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
[env: MAX_WAITING_TOKENS=]
[default: 20]
```
## MAX_BATCH_SIZE
```shell
--max-batch-size <MAX_BATCH_SIZE>
Enforce a maximum number of requests per batch. This is a specific flag for hardware targets that do not support unpadded inference
[env: MAX_BATCH_SIZE=]
```
## CUDA_GRAPHS
```shell
--cuda-graphs <CUDA_GRAPHS>
Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
[env: CUDA_GRAPHS=]
```
## HOSTNAME
```shell
--hostname <HOSTNAME>
The IP address to listen on
[env: HOSTNAME=]
[default: 0.0.0.0]
```
## PORT
```shell
-p, --port <PORT>
The port to listen on
[env: PORT=]
[default: 3000]
```
## SHARD_UDS_PATH
```shell
--shard-uds-path <SHARD_UDS_PATH>
The name of the socket for gRPC communication between the webserver and the shards
[env: SHARD_UDS_PATH=]
[default: /tmp/text-generation-server]
```
## MASTER_ADDR
```shell
--master-addr <MASTER_ADDR>
The address the master shard will listen on. (setting used by torch distributed)
[env: MASTER_ADDR=]
[default: localhost]
```
## MASTER_PORT
```shell
--master-port <MASTER_PORT>
The port the master shard will listen on. (setting used by torch distributed)
[env: MASTER_PORT=]
[default: 29500]
```
## HUGGINGFACE_HUB_CACHE
```shell
--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
[env: HUGGINGFACE_HUB_CACHE=]
```
## WEIGHTS_CACHE_OVERRIDE
```shell
--weights-cache-override <WEIGHTS_CACHE_OVERRIDE>
The location of the weights cache. Used to override the default location if you want to provide a mounted disk with pre-downloaded weights, for instance
[env: WEIGHTS_CACHE_OVERRIDE=]
```
## DISABLE_CUSTOM_KERNELS
```shell
--disable-custom-kernels
For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
[env: DISABLE_CUSTOM_KERNELS=]
```
## CUDA_MEMORY_FRACTION
```shell
--cuda-memory-fraction <CUDA_MEMORY_FRACTION>
Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
[env: CUDA_MEMORY_FRACTION=]
[default: 1.0]
```
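Like most launcher flags, `--cuda-graphs` and `--cuda-memory-fraction` can also be set through their environment variables. A minimal sketch, with illustrative values, that disables CUDA graph capture and caps TGI at 80% of the visible GPU memory:
```bash
# Illustrative: disable CUDA graph capture and limit TGI to 80% of visible GPU memory.
# <MODEL_HUB_ID> is a placeholder.
CUDA_GRAPHS=0 CUDA_MEMORY_FRACTION=0.8 \
    text-generation-launcher --model-id <MODEL_HUB_ID>
```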
## ROPE_SCALING
```shell
--rope-scaling <ROPE_SCALING>
Rope scaling will only be used for RoPE models and allows rescaling the rotary position embeddings to accommodate larger prompts.
Goes together with `rope_factor`.
`--rope-factor 2.0` gives linear scaling with a factor of 2.0. `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0. `--rope-scaling linear` gives linear scaling with a factor of 1.0 (basically nothing is changed).
`--rope-scaling linear --rope-factor` fully describes the scaling you want
[env: ROPE_SCALING=]
[possible values: linear, dynamic]
```
## ROPE_FACTOR
```shell
--rope-factor <ROPE_FACTOR>
Rope scaling will only be used for RoPE models. See `rope_scaling`
[env: ROPE_FACTOR=]
```
## JSON_OUTPUT
```shell
--json-output
Outputs the logs in JSON format (useful for telemetry)
[env: JSON_OUTPUT=]
```
## OTLP_ENDPOINT
```shell
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
```
## OTLP_SERVICE_NAME
```shell
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
[default: text-generation-inference.router]
```
## CORS_ALLOW_ORIGIN
```shell
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
```
## WATERMARK_GAMMA
```shell
--watermark-gamma <WATERMARK_GAMMA>
[env: WATERMARK_GAMMA=]
```
## WATERMARK_DELTA
```shell
--watermark-delta <WATERMARK_DELTA>
[env: WATERMARK_DELTA=]
```
## NGROK
```shell
--ngrok
Enable ngrok tunneling
[env: NGROK=]
```
## NGROK_AUTHTOKEN
```shell
--ngrok-authtoken <NGROK_AUTHTOKEN>
ngrok authentication token
[env: NGROK_AUTHTOKEN=]
```
## NGROK_EDGE
```shell
--ngrok-edge <NGROK_EDGE>
ngrok edge
[env: NGROK_EDGE=]
```
## TOKENIZER_CONFIG_PATH
```shell
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
[env: TOKENIZER_CONFIG_PATH=]
```
## DISABLE_GRAMMAR_SUPPORT
```shell
--disable-grammar-support
Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
[env: DISABLE_GRAMMAR_SUPPORT=]
```
## ENV
```shell
-e, --env
Display a lot of information about your runtime environment
```
## MAX_CLIENT_BATCH_SIZE
```shell
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
Control the maximum number of inputs that a client can send in a single request
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 4]
```
## LORA_ADAPTERS
```shell
--lora-adapters <LORA_ADAPTERS>
A comma-separated list of LoRA adapter ids, i.e. `repo/adapter1,repo/adapter2`, to load during startup; they will be available to callers via the `adapter_id` field in a request
[env: LORA_ADAPTERS=]
```
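As a sketch, loading two hypothetical adapters at startup and selecting one per request via the `adapter_id` field could look like this (the adapter ids and prompt are placeholders):
```bash
# Load two hypothetical adapters at startup (placeholder ids).
text-generation-launcher --model-id <MODEL_HUB_ID> \
    --lora-adapters <org/adapter1>,<org/adapter2>

# Select an adapter for a single request via the `adapter_id` parameter
# (port 3000 is the launcher default).
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"Hello","parameters":{"max_new_tokens":16,"adapter_id":"<org/adapter1>"}}'
```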
## HELP
```shell
-h, --help
Print help (see a summary with '-h')
```
## VERSION
```shell
-V, --version
Print version
```
# Monitoring TGI server with Prometheus and Grafana dashboard
TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Examples of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, the number of generated tokens, etc.
In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
## Setup on the server machine
First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
```
wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
cd prometheus-2.52.0.linux-amd64
```
Prometheus needs to be configured to listen on TGI's port. To do so, in Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
```
static_configs:
- targets: ["0.0.0.0:80"]
```
to use the correct IP address and port.
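For reference, a minimal `prometheus.yml` along those lines could look like the sketch below, written as a shell heredoc. The target assumes TGI listens on `0.0.0.0:80` as in this guide; adjust it to your actual IP and port.
```bash
# Minimal example scrape config pointing Prometheus at the TGI instance.
cat > prometheus.yml <<'EOF'
global:
  scrape_interval: 10s
scrape_configs:
  - job_name: tgi
    static_configs:
      - targets: ["0.0.0.0:80"]
EOF
```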
We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the IP and port are configured correctly.
Once Prometheus is configured, the Prometheus server can be launched on the same machine where TGI is running:
```
./prometheus --config.file="prometheus.yml"
```
In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (by default 9090) to the local computer. To do so, we can for example:
* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example) (a minimal sketch is shown right after this list)
* Use ngrok port tunneling
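A minimal sketch of the ssh option, assuming the TGI server is reachable as `user@tgi-server` (a placeholder), would be:
```bash
# Forward the remote Prometheus port 9090 to localhost:9090 on the monitoring machine.
# `user@tgi-server` is a placeholder for your actual ssh destination.
ssh -N -L 9090:localhost:9090 user@tgi-server
```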
For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.
For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
```bash
ngrok http http://0.0.0.0:9090
```
As a sanity check, one can make sure that the Prometheus server can be accessed from a local machine at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app).
## Setup on the monitoring machine
Monitoring is typically done on a machine other than the one hosting the server. We use a Grafana dashboard to monitor TGI's server usage.
Two options are available:
* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
* Self-host a Grafana dashboard.
In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
```bash
wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
cd grafana-11.0.0
./bin/grafana-server
```
Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added, using the Ngrok address we got earlier that exposes the Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
Load your dashboard configuration, and your TGI dashboard should be ready to go!
# Non-core Model Serving
TGI supports various LLM architectures (see the full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fall back to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
You can serve these models using the same Docker command-line invocation as with fully supported models 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```
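Once the container is up, you can sanity-check it with a plain `generate` request; the prompt and parameters below are arbitrary:
```bash
# Quick sanity check against the container started above (published on port 8080).
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"My name is Olivier and I","parameters":{"max_new_tokens":20}}'
```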
If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```
Finally, if the model is not on the Hugging Face Hub but stored locally, you can pass the path to the folder that contains your model like below 👇
```bash
# Make sure your model is in the $volume directory
docker run --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
```
You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.
# Preparing the Model
Text Generation Inference improves model serving in several aspects.
## Quantization
TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set the `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq); when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to the [quantization guide](./../conceptual/quantization).
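For instance, a sketch of serving a GPTQ-quantized checkpoint through Docker could look like this; `<GPTQ_MODEL_ID>` is a placeholder for any of the GPTQ models linked above:
```bash
# Illustrative: serve a GPTQ-quantized checkpoint; replace the placeholder model id.
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id <GPTQ_MODEL_ID> --quantize gptq
```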
## RoPE Scaling
RoPE scaling can be used to increase the sequence length of the model during inference without necessarily fine-tuning it. To enable RoPE scaling, simply pass the `--rope-scaling`, `--max-input-length` and `--rope-factor` flags when running through the CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide the maximum input length for extension.
<Tip>
We recommend using `dynamic` RoPE scaling.
</Tip>
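As a sketch, serving a model with dynamic RoPE scaling to roughly double its usable context could look like this; the model id and lengths are illustrative placeholders:
```bash
# Illustrative: dynamic RoPE scaling with a factor of 2.0 and an extended input length.
text-generation-launcher --model-id <MODEL_HUB_ID> \
    --rope-scaling dynamic \
    --rope-factor 2.0 \
    --max-input-length 4096 \
    --max-total-tokens 4608
```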
## Safetensors
[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there are no `safetensors` weights, TGI will convert the `pytorch` weights to the `safetensors` format.
# Model safety
[PyTorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default, meaning that for quite a long while
*every* model using that format could potentially execute unintended code simply by being loaded.
There is a big red warning on Python's page for pickle [link](https://docs.python.org/3/library/pickle.html), but for quite a while
this was ignored by the community. Now that AI/ML is being used much more ubiquitously, we need to switch away from this format.
HuggingFace is leading the effort here by creating a new format which contains pure data ([safetensors](https://github.com/huggingface/safetensors))
and by moving, slowly but surely, all the libraries to use it by default.
The move is intentionally slow in order to make breaking changes have as little impact as possible on users.
# TGI 2.0
With the release of TGI 2.0, we took the opportunity of this major version increase to break backward compatibility for these PyTorch pickle
models (since they are a huge security risk for anyone deploying them).
From now on, TGI will not automatically convert pickle files without the `--trust-remote-code` flag or `TRUST_REMOTE_CODE=true` set in the environment variables.
This flag is already used for community-defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.
If you want to use a model that uses pickle, but you still do not want to trust the authors entirely, we recommend making a conversion on the space we made for that:
https://huggingface.co/spaces/safetensors/convert
This space will create a PR on the original model, which you can use directly regardless of its merge status with the original authors. Just use
```
docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
```
# Train Medusa
This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation) for more information on how Medusa works and speculation in general.
## What are the benefits of training a Medusa model?
Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.
One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.
If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.
## Self-distillation (Generating data for training)
There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.
Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.
We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.
## Training
The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.
### Getting Started
There are two methods for training the model:
- `torchrun`, a wrapper around `torch.distributed.launch`
- a forked version of `axolotl` that supports Medusa
In this tutorial we'll use `torchrun` to train the model, as it is the most straightforward approach; similar steps can be followed with `axolotl` if you prefer.
### Training with `torchrun`
```bash
mkdir medusa-training
cd medusa-training
pyenv install 3.10
pyenv local 3.10
uv venv -p 3.10
source .venv/bin/activate
```
Now let's clone the original `Medusa` repository and install the library.
```bash
git clone https://github.com/FasterDecoding/Medusa.git
cd Medusa
pip install -e .
```
Next, we'll need some data to train on. We can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.
```bash
apt install git-lfs
git lfs install
git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
```
Currently our directory structure looks like this:
```bash
.
├── assets
├── CITATION.cff
├── create_data.py
├── data_generation
├── deepspeed.json
├── last_run_prepared
├── LICENSE
├── llm_judge
├── medusa
├── medusa_llm.egg-info
├── mistral.json
├── notebooks
├── pyproject.toml
├── README.md
├── ROADMAP.md
├── scripts
├── ShareGPT_Vicuna_unfiltered
│   ├── README.md
│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
├── simple_gradio_interface.py
├── tiny-llama.json
└── vicuna_7b_qlora_stage1
```
## Start Training
Now let's generate the data and start training the model. This process will take a while since we are generating data from the model.
First make sure you have an instance of TGI running with the model you want to use for self-distillation.
```bash
model=HuggingFaceH4/zephyr-7b-beta
volume=/home/ubuntu/.cache/huggingface/hub/
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
```
Now we can generate the data using the `create_data.py` script.
```bash
python create_data.py \
--input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
--output-filename zephyr_self_distill.json
```
At this point our terminal should look like this:
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
width="550"
/>
</div>
> Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process; you should use a much larger dataset for training.
Now we can finally get to the fun part and start training the model!
Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file.
> NOTE: If you just self-distilled, you may still have the model running; make sure to stop it before starting the training so that all of the resources can be used for training.
```bash
WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
--model_name_or_path HuggingFaceH4/zephyr-7b-beta \
--data_path zephyr_self_distill.json \
--bf16 True \
--output_dir zephyr_out \
--num_train_epochs 5 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--evaluation_strategy "no" \
--save_strategy "no" \
--learning_rate 1e-3 \
--weight_decay 0.0 \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--lazy_preprocess True \
--medusa_num_heads 3 \
--medusa_num_layers 1 \
--deepspeed deepspeed.json
```
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
width="550"
/>
</div>
If successful, you should see output similar to the one below:
```bash
wandb: Run history:
wandb: train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb: train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb: train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
wandb: train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
wandb: train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb: train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
wandb: train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
wandb: train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
wandb: train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb: train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
wandb: train/total_flos ▁
wandb: train/train_loss ▁
wandb: train/train_runtime ▁
wandb: train/train_samples_per_second ▁
wandb: train/train_steps_per_second ▁
wandb:
wandb: Run summary:
wandb: train/epoch 2.0
wandb: train/global_step 16
wandb: train/learning_rate 0.0
wandb: train/loss 14.8906
wandb: train/medusa0_loss 4.25
wandb: train/medusa0_top1 0.28809
wandb: train/medusa1_loss 4.8125
wandb: train/medusa1_top1 0.22727
wandb: train/medusa2_loss 5.5
wandb: train/medusa2_top1 0.17293
wandb: train/total_flos 0.0
wandb: train/train_loss 23.98242
wandb: train/train_runtime 396.9266
wandb: train/train_samples_per_second 2.519
wandb: train/train_steps_per_second 0.04
```
Finally, and most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.
```bash
python -m medusa.hf_utils \
--folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
--repo drbh/zephyr_medusa_demo
```
Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
# Using TGI CLI
You can use the TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).
`text-generation-server` lets you download the model with the `download-weights` command like below 👇
```bash
text-generation-server download-weights MODEL_HUB_ID
```
You can also use it to quantize models like below 👇
```bash
text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
```
You can use `text-generation-launcher` to serve models.
```bash
text-generation-launcher --model-id MODEL_HUB_ID --port 8080
```
There are many options and parameters you can pass to `text-generation-launcher`. The documentation for the CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running
```bash
text-generation-launcher --help
```
You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).
The same documentation can be found for `text-generation-server`.
```bash
text-generation-server --help
```
# Guidance
Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.
These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
## How it works
TGI leverages the [outlines](https://github.com/outlines-dev/outlines) library to efficiently parse and compile the grammatical structures and tools specified by users. This integration transforms the defined grammars into an intermediate representation that acts as a framework to guide and constrain content generation, ensuring that outputs adhere to the specified grammatical rules.
If you are interested in the technical details on how outlines is used in TGI, you can check out the [conceptual guidance documentation](../conceptual/guidance).
## Table of Contents 📚
### Grammar and Constraints
- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
- [JSON Schema Integration](#json-schema-integration): Fine-grained control over your requests via JSON schema.
- [Using the client](#using-the-client): Use TGI's client libraries to shape the AI's responses.
### Tools and Functions
- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
- [Via the client](#text-generation-inference-client): Use TGI's client libraries to interact with the Messages API and Tool functions.
- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
## Grammar and Constraints 🛣️
### The Grammar Parameter
In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the LLM.
Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.
```bash
curl localhost:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
"parameters": {
"repetition_penalty": 1.3,
"grammar": {
"type": "json",
"value": {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
}
}
}'
// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}
```
### Hugging Face Hub Python Library
The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
schema = {
"properties": {
"location": {"title": "Location", "type": "string"},
"activity": {"title": "Activity", "type": "string"},
"animals_seen": {
"maximum": 5,
"minimum": 1,
"title": "Animals Seen",
"type": "integer",
},
"animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
},
"required": ["location", "activity", "animals_seen", "animals"],
"title": "Animals",
"type": "object",
}
user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
max_new_tokens=100,
seed=42,
grammar={"type": "json", "value": schema},
)
print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```
A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.
> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
### Constrain with Pydantic
Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
```python
from huggingface_hub import InferenceClient
from pydantic import BaseModel, conint
from typing import List
class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]
client = InferenceClient("http://localhost:3000")
user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
max_new_tokens=100,
seed=42,
grammar={"type": "json", "value": Animals.schema()},
)
print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```
A grammar can also be defined using regular expressions:
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
resp = client.text_generation(
f"Whats Googles DNS? Please use the following regex: {regexp}",
seed=42,
grammar={
"type": "regex",
"value": regexp,
},
)
print(resp)
# 7.1.1.1
```
## Tools and Functions 🛠️
### The Tools Parameter
In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.
Tools are a set of user-defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities. Functions, similar to grammars, are defined as a JSON schema and can be passed as part of the parameters to the Messages API.
```bash
curl localhost:3000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"model": "tgi",
"messages": [
{
"role": "user",
"content": "What is the weather like in New York?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location."
}
},
"required": ["location", "format"]
}
}
}
],
"tool_choice": "get_current_weather"
}'
// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
```
### Chat Completion with Tools
Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
},
{
"type": "function",
"function": {
"name": "get_n_day_weather_forecast",
"description": "Get an N-day weather forecast",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
"num_days": {
"type": "integer",
"description": "The number of days to forecast",
},
},
"required": ["location", "format", "num_days"],
},
},
},
]
chat = client.chat_completion(
messages=[
{
"role": "system",
"content": "You're a helpful assistant! Answer the users question best you can.",
},
{
"role": "user",
"content": "What is the weather like in Brooklyn, New York?",
},
],
tools=tools,
seed=42,
max_tokens=100,
)
print(chat.choices[0].message.tool_calls)
# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
```
### OpenAI integration
TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
However, there are some minor differences in the API; for example, `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API, where `tool_choice="auto"` will choose a tool only if the model thinks it's necessary.
```python
from openai import OpenAI
# Initialize the client, pointing it to one of the available models
client = OpenAI(
base_url="http://localhost:3000/v1",
api_key="_",
)
# NOTE: tools defined above and removed for brevity
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{
"role": "system",
"content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
},
{
"role": "user",
"content": "What's the weather like the next 3 days in San Francisco, CA?",
},
],
tools=tools,
tool_choice="auto", # tool selected by model
max_tokens=500,
)
called = chat_completion.choices[0].message.tool_calls
print(called)
# {
# "id": 0,
# "type": "function",
# "function": {
# "description": None,
# "name": "tools",
# "parameters": {
# "format": "celsius",
# "location": "San Francisco, CA",
# "num_days": 3,
# },
# },
# }
```
# Vision Language Model Inference in TGI
Vision-language models (VLMs) are models that consume both image and text inputs to generate text.
VLMs are trained on a combination of image and text data and can handle a wide range of tasks, such as image captioning, visual question answering, and visual dialog.
> What distinguishes VLMs from other text and image models is their ability to handle long context and generate text that is coherent and relevant to the image even after multiple turns or, in some cases, multiple images.
Below are a couple of common use cases for vision language models:
- **Image Captioning**: Given an image, generate a caption that describes the image.
- **Visual Question Answering (VQA)**: Given an image and a question about the image, generate an answer to the question.
- **Multimodal Dialog**: Generate responses to multiple turns of images and conversations.
- **Image Information Retrieval**: Given an image, retrieve information from the image.
## How to Use a Vision Language Model?
### Hugging Face Hub Python Library
To infer with vision language models through Python, you can use the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The `InferenceClient` class provides a simple way to interact with the [Inference API](https://huggingface.co/docs/api-inference/index). Images can be passed as URLs or base64-encoded strings. The `InferenceClient` will automatically detect the image format.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://127.0.0.1:3000")
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
prompt = f"![]({image})What is this a picture of?\n\n"
for token in client.text_generation(prompt, max_new_tokens=16, stream=True):
    print(token)
# This is a picture of an anthropomorphic rabbit in a space suit.
```
```python
from huggingface_hub import InferenceClient
import base64
import requests
import io
client = InferenceClient("http://127.0.0.1:3000")
# read image from local file
image_path = "rabbit.png"
with open(image_path, "rb") as f:
    image = base64.b64encode(f.read()).decode("utf-8")
image = f"data:image/png;base64,{image}"
prompt = f"![]({image})What is this a picture of?\n\n"
for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
    print(token)
# This is a picture of an anthropomorphic rabbit in a space suit.
```
or via the `chat_completion` endpoint:
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://127.0.0.1:3000")
chat = client.chat_completion(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
},
},
],
},
],
seed=42,
max_tokens=100,
)
print(chat)
# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
```
or with OpenAI's library:
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
},
},
],
},
],
stream=False,
)
print(chat_completion)
# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
```
### Inference Through Sending `cURL` Requests
To use the `generate_stream` endpoint with curl, you can add the `-N` flag. This flag disables curl's default buffering and shows data as it arrives from the server.
```bash
curl -N 127.0.0.1:3000/generate_stream \
-X POST \
-d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
-H 'Content-Type: application/json'
# ...
# data:{"index":16,"token":{"id":28723,"text":".","logprob":-0.6196289,"special":false},"generated_text":"This is a picture of an anthropomorphic rabbit in a space suit.","details":null}
```
### Inference Through JavaScript
First, we need to install the `@huggingface/inference` library.
```bash
npm install @huggingface/inference
```
If you're using the free Inference API, you can use [Huggingface.js](https://huggingface.co/docs/huggingface.js/inference/README)'s `HfInference`. If you're using inference endpoints, you can use the `HfInferenceEndpoint` class to easily interact with the Inference API.
We can create a `HfInferenceEndpoint` providing our endpoint URL and a [Hugging Face access token](https://huggingface.co/settings/tokens).
```js
import { HfInferenceEndpoint } from "@huggingface/inference";
const hf = new HfInferenceEndpoint("http://127.0.0.1:3000", "HF_TOKEN");
const prompt =
"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n";
const stream = hf.textGenerationStream({
inputs: prompt,
parameters: { max_new_tokens: 16, seed: 42 },
});
for await (const r of stream) {
// yield the generated token
process.stdout.write(r.token.text);
}
// This is a picture of an anthropomorphic rabbit in a space suit.
```
## Combining Vision Language Models with Other Features
VLMs in TGI have several advantages, for example these models can be used in tandem with other features for more complex tasks. For example, you can use VLMs with [Guided Generation](/docs/conceptual/guided-generation) to generate specific JSON data from an image.
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
width="400"
/>
</div>
For example we can extract information from the rabbit image and generate a JSON object with the location, activity, number of animals seen, and the animals seen. That would look like this:
```json
{
"activity": "Standing",
"animals": ["Rabbit"],
"animals_seen": 1,
"location": "Rocky surface with mountains in the background and a red light on the rabbit's chest"
}
```
All we need to do is provide a JSON schema to the VLM model and it will generate the JSON object for us.
```bash
curl localhost:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n",
"parameters": {
"max_new_tokens": 100,
"seed": 42,
"grammar": {
"type": "json",
"value": {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
}
}
}'
# {
# "generated_text": "{ \"activity\": \"Standing\", \"animals\": [ \"Rabbit\" ], \"animals_seen\": 1, \"location\": \"Rocky surface with mountains in the background and a red light on the rabbit's chest\" }"
# }
```
Want to learn more about how Vision Language Models work? Check out the [awesome blog post on the topic](https://huggingface.co/blog/vlms).