add tgi2.4.0

81a882ad · jixx · 9822d7f6 · 81a882ad · 81a882ad · 81a882ad
Commit 81a882ad authored Nov 21, 2024 by jixx
20 changed files
--- a/clients/python/text_generation/client.py
+++ b/clients/python/text_generation/client.py
@@ -757,7 +757,12 @@ class AsyncClient:
                        continue
                    payload = byte_payload.decode("utf-8")
                    if payload.startswith("data:"):
-                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                        payload_data = (
+                            payload.lstrip("data:").rstrip("\n").removeprefix(" ")
+                        )
+                        if payload_data == "[DONE]":
+                            break
+                        json_payload = json.loads(payload_data)
                        try:
                            response = ChatCompletionChunk(**json_payload)
                            yield response

--- a/clients/python/text_generation/inference_api.py
+++ b/clients/python/text_generation/inference_api.py
@@ -21,7 +21,7 @@ def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
        List[DeployedModel]: list of all currently deployed models
    """
    resp = requests.get(
-        f"https://api-inference.huggingface.co/framework/text-generation-inference",
+        "https://api-inference.huggingface.co/framework/text-generation-inference",
        headers=headers,
        timeout=5,
    )

--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -28,11 +28,17 @@ class ToolCall(BaseModel):
    function: dict


+class Chunk(BaseModel):
+    type: str
+    text: Optional[str] = None
+    image_url: Any = None
+
+
 class Message(BaseModel):
    # Role of the message sender
    role: str
    # Content of the message
-    content: Optional[str] = None
+    content: Optional[Union[str, List[Chunk]]] = None
    # Optional name of the message sender
    name: Optional[str] = None
    # Tool calls associated with the chat completion
@@ -61,7 +67,7 @@ class ChoiceDeltaToolCall(BaseModel):
 class ChoiceDelta(BaseModel):
    role: str
    content: Optional[str] = None
-    tool_calls: Optional[ChoiceDeltaToolCall]
+    tool_calls: Optional[ChoiceDeltaToolCall] = None


 class Choice(BaseModel):
@@ -168,7 +174,7 @@ class ChatCompletionComplete(BaseModel):
    # Log probabilities for the chat completion
    logprobs: Optional[Any]
    # Reason for completion
-    finish_reason: str
+    finish_reason: Optional[str]
    # Usage details of the chat completion
    usage: Optional[Any] = None

@@ -191,6 +197,7 @@ class ChatCompletionChunk(BaseModel):
    model: str
    system_fingerprint: str
    choices: List[Choice]
+    usage: Optional[Any] = None


 class Parameters(BaseModel):

--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
-    "version": "2.1.1"
+    "version": "2.4.0"
  },
  "paths": {
    "/": {
@@ -316,6 +316,98 @@
        }
      }
    },
+    "/invocations": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Generate tokens from Sagemaker request",
+        "operationId": "sagemaker_compatibility",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/SagemakerRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Generated Chat Completion",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerResponse"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/SagemakerStreamResponse"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Input validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Input validation error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Generation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Request failed during generation",
+                  "error_type": "generation"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Incomplete generation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Incomplete generation",
+                  "error_type": "incomplete_generation"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
    "/metrics": {
      "get": {
        "tags": [
@@ -492,12 +584,12 @@
            "content": {
              "application/json": {
                "schema": {
-                  "$ref": "#/components/schemas/Completion"
+                  "$ref": "#/components/schemas/CompletionFinal"
                }
              },
              "text/event-stream": {
                "schema": {
-                  "$ref": "#/components/schemas/CompletionCompleteChunk"
+                  "$ref": "#/components/schemas/Chunk"
                }
              }
            }
@@ -556,6 +648,37 @@
          }
        }
      }
+    },
+    "/v1/models": {
+      "get": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Get model info",
+        "operationId": "openai_get_model_info",
+        "responses": {
+          "200": {
+            "description": "Served model info",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ModelInfo"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "Model not found",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          }
+        }
+      }
    }
  },
  "components": {
@@ -711,6 +834,14 @@
          },
          "system_fingerprint": {
            "type": "string"
+          },
+          "usage": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/Usage"
+              }
+            ],
+            "nullable": true
          }
        }
      },
@@ -809,7 +940,6 @@
      "ChatRequest": {
        "type": "object",
        "required": [
-          "model",
          "messages"
        ],
        "properties": {
@@ -820,6 +950,13 @@
            "example": "1.0",
            "nullable": true
          },
+          "guideline": {
+            "type": "string",
+            "description": "A guideline to be used in the chat_template",
+            "default": "null",
+            "example": "null",
+            "nullable": true
+          },
          "logit_bias": {
            "type": "array",
            "items": {
@@ -854,7 +991,8 @@
          "model": {
            "type": "string",
            "description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
-            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+            "example": "mistralai/Mistral-7B-Instruct-v0.2",
+            "nullable": true
          },
          "n": {
            "type": "integer",
@@ -899,6 +1037,14 @@
          "stream": {
            "type": "boolean"
          },
+          "stream_options": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/StreamOptions"
+              }
+            ],
+            "nullable": true
+          },
          "temperature": {
            "type": "number",
            "format": "float",
@@ -909,7 +1055,7 @@
          "tool_choice": {
            "allOf": [
              {
-                "$ref": "#/components/schemas/ToolType"
+                "$ref": "#/components/schemas/ToolChoice"
              }
            ],
            "nullable": true
@@ -917,7 +1063,7 @@
          "tool_prompt": {
            "type": "string",
            "description": "A prompt to be appended before the tools",
-            "example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
+            "example": "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.",
            "nullable": true
          },
          "tools": {
@@ -1116,7 +1262,6 @@
      "CompletionRequest": {
        "type": "object",
        "required": [
-          "model",
          "prompt"
        ],
        "properties": {
@@ -1138,7 +1283,8 @@
          "model": {
            "type": "string",
            "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
-            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+            "example": "mistralai/Mistral-7B-Instruct-v0.2",
+            "nullable": true
          },
          "prompt": {
            "$ref": "#/components/schemas/Prompt"
@@ -1324,6 +1470,17 @@
          }
        }
      },
+      "FunctionName": {
+        "type": "object",
+        "required": [
+          "name"
+        ],
+        "properties": {
+          "name": {
+            "type": "string"
+          }
+        }
+      },
      "GenerateParameters": {
        "type": "object",
        "properties": {
@@ -1569,16 +1726,11 @@
        "type": "object",
        "required": [
          "model_id",
-          "model_dtype",
-          "model_device_type",
          "max_concurrent_requests",
          "max_best_of",
          "max_stop_sequences",
          "max_input_tokens",
          "max_total_tokens",
-          "waiting_served_ratio",
-          "max_batch_total_tokens",
-          "max_waiting_tokens",
          "validation_workers",
          "max_client_batch_size",
          "router",
@@ -1590,18 +1742,6 @@
            "example": "null",
            "nullable": true
          },
-          "max_batch_size": {
-            "type": "integer",
-            "example": "null",
-            "nullable": true,
-            "minimum": 0
-          },
-          "max_batch_total_tokens": {
-            "type": "integer",
-            "format": "int32",
-            "example": "32000",
-            "minimum": 0
-          },
          "max_best_of": {
            "type": "integer",
            "example": "2",
@@ -1633,19 +1773,6 @@
            "example": "2048",
            "minimum": 0
          },
-          "max_waiting_tokens": {
-            "type": "integer",
-            "example": "20",
-            "minimum": 0
-          },
-          "model_device_type": {
-            "type": "string",
-            "example": "cuda"
-          },
-          "model_dtype": {
-            "type": "string",
-            "example": "torch.float16"
-          },
          "model_id": {
            "type": "string",
            "description": "Model info",
@@ -1679,11 +1806,6 @@
          "version": {
            "type": "string",
            "example": "0.5.0"
-          },
-          "waiting_served_ratio": {
-            "type": "number",
-            "format": "float",
-            "example": "1.2"
          }
        }
      },
@@ -1708,6 +1830,101 @@
          }
        }
      },
+      "MessageChunk": {
+        "oneOf": [
+          {
+            "type": "object",
+            "required": [
+              "text",
+              "type"
+            ],
+            "properties": {
+              "text": {
+                "type": "string"
+              },
+              "type": {
+                "type": "string",
+                "enum": [
+                  "text"
+                ]
+              }
+            }
+          },
+          {
+            "type": "object",
+            "required": [
+              "image_url",
+              "type"
+            ],
+            "properties": {
+              "image_url": {
+                "$ref": "#/components/schemas/Url"
+              },
+              "type": {
+                "type": "string",
+                "enum": [
+                  "image_url"
+                ]
+              }
+            }
+          }
+        ],
+        "discriminator": {
+          "propertyName": "type"
+        }
+      },
+      "MessageContent": {
+        "oneOf": [
+          {
+            "type": "string"
+          },
+          {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/MessageChunk"
+            }
+          }
+        ]
+      },
+      "ModelInfo": {
+        "type": "object",
+        "required": [
+          "id",
+          "object",
+          "created",
+          "owned_by"
+        ],
+        "properties": {
+          "created": {
+            "type": "integer",
+            "format": "int64",
+            "example": 1686935002,
+            "minimum": 0
+          },
+          "id": {
+            "type": "string",
+            "example": "gpt2"
+          },
+          "object": {
+            "type": "string",
+            "example": "model"
+          },
+          "owned_by": {
+            "type": "string",
+            "example": "openai"
+          }
+        }
+      },
+      "OutputMessage": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/TextMessage"
+          },
+          {
+            "$ref": "#/components/schemas/ToolCallMessage"
+          }
+        ]
+      },
      "PrefillToken": {
        "type": "object",
        "required": [
@@ -1740,6 +1957,45 @@
          "type": "string"
        }
      },
+      "SagemakerRequest": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/CompatGenerateRequest"
+          },
+          {
+            "$ref": "#/components/schemas/ChatRequest"
+          },
+          {
+            "$ref": "#/components/schemas/CompletionRequest"
+          }
+        ]
+      },
+      "SagemakerResponse": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/GenerateResponse"
+          },
+          {
+            "$ref": "#/components/schemas/ChatCompletion"
+          },
+          {
+            "$ref": "#/components/schemas/CompletionFinal"
+          }
+        ]
+      },
+      "SagemakerStreamResponse": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/StreamResponse"
+          },
+          {
+            "$ref": "#/components/schemas/ChatCompletionChunk"
+          },
+          {
+            "$ref": "#/components/schemas/Chunk"
+          }
+        ]
+      },
      "SimpleToken": {
        "type": "object",
        "required": [
@@ -1775,7 +2031,8 @@
        "type": "object",
        "required": [
          "finish_reason",
-          "generated_tokens"
+          "generated_tokens",
+          "input_length"
        ],
        "properties": {
          "finish_reason": {
@@ -1787,6 +2044,12 @@
            "example": 1,
            "minimum": 0
          },
+          "input_length": {
+            "type": "integer",
+            "format": "int32",
+            "example": 1,
+            "minimum": 0
+          },
          "seed": {
            "type": "integer",
            "format": "int64",
@@ -1796,6 +2059,19 @@
          }
        }
      },
+      "StreamOptions": {
+        "type": "object",
+        "required": [
+          "include_usage"
+        ],
+        "properties": {
+          "include_usage": {
+            "type": "boolean",
+            "description": "If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
+            "example": "true"
+          }
+        }
+      },
      "StreamResponse": {
        "type": "object",
        "required": [
@@ -1834,6 +2110,23 @@
          }
        }
      },
+      "TextMessage": {
+        "type": "object",
+        "required": [
+          "role",
+          "content"
+        ],
+        "properties": {
+          "content": {
+            "type": "string",
+            "example": "My name is David and I"
+          },
+          "role": {
+            "type": "string",
+            "example": "user"
+          }
+        }
+      },
      "Token": {
        "type": "object",
        "required": [
@@ -1906,15 +2199,64 @@
          }
        }
      },
+      "ToolCallDelta": {
+        "type": "object",
+        "required": [
+          "role",
+          "tool_calls"
+        ],
+        "properties": {
+          "role": {
+            "type": "string",
+            "example": "assistant"
+          },
+          "tool_calls": {
+            "$ref": "#/components/schemas/DeltaToolCall"
+          }
+        }
+      },
+      "ToolCallMessage": {
+        "type": "object",
+        "required": [
+          "role",
+          "tool_calls"
+        ],
+        "properties": {
+          "role": {
+            "type": "string",
+            "example": "assistant"
+          },
+          "tool_calls": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ToolCall"
+            }
+          }
+        }
+      },
+      "ToolChoice": {
+        "allOf": [
+          {
+            "$ref": "#/components/schemas/ToolType"
+          }
+        ],
+        "nullable": true
+      },
      "ToolType": {
        "oneOf": [
          {
-            "type": "object",
-            "default": null,
-            "nullable": true
+            "type": "string",
+            "description": "Means the model can pick between generating a message or calling one or more tools.",
+            "enum": [
+              "auto"
+            ]
          },
          {
-            "type": "string"
+            "type": "string",
+            "description": "Means the model will not call any tool and instead generates a message.",
+            "enum": [
+              "none"
+            ]
          },
          {
            "type": "object",
@@ -1927,7 +2269,20 @@
              }
            }
          }
-        ]
+        ],
+        "description": "Controls which (if any) tool is called by the model.",
+        "example": "auto"
+      },
+      "Url": {
+        "type": "object",
+        "required": [
+          "url"
+        ],
+        "properties": {
+          "url": {
+            "type": "string"
+          }
+        }
      },
      "Usage": {
        "type": "object",

--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -3,6 +3,8 @@
    title: Text Generation Inference
  - local: quicktour
    title: Quick Tour
+  - local: supported_models
+    title: Supported Models
  - local: installation_nvidia
    title: Using TGI with Nvidia GPUs
  - local: installation_amd
@@ -11,14 +13,15 @@
    title: Using TGI with Intel Gaudi
  - local: installation_inferentia
    title: Using TGI with AWS Inferentia
+  - local: installation_intel
+    title: Using TGI with Intel GPUs
  - local: installation
    title: Installation from source
-  - local: supported_models
-    title: Supported Models and Hardware
-  - local: messages_api
-    title: Messages API
+
  - local: architecture
    title: Internal Architecture
+  - local: usage_statistics
+    title: Usage Statistics
  title: Getting started
 - sections:
  - local: basic_tutorials/consuming_tgi
@@ -29,8 +32,6 @@
    title: Serving Private & Gated Models
  - local: basic_tutorials/using_cli
    title: Using TGI CLI
-  - local: basic_tutorials/launcher
-    title: All TGI CLI options
  - local: basic_tutorials/non_core_models
    title: Non-core Model Serving
  - local: basic_tutorials/safety
@@ -44,6 +45,14 @@
  - local: basic_tutorials/train_medusa
    title: Train Medusa
  title: Tutorials
+- sections:
+  - local: reference/launcher
+    title: All TGI CLI options
+  - local: reference/metrics
+    title: Exported Metrics
+  - local: reference/api_reference
+    title: API Reference
+  title: Reference
 - sections:
  - local: conceptual/streaming
    title: Streaming
@@ -60,9 +69,11 @@
  - local: conceptual/speculation
    title: Speculation (Medusa, ngram)
  - local: conceptual/guidance
-    title: How Guidance Works (via outlines
+    title: How Guidance Works (via outlines)
  - local: conceptual/lora
    title: LoRA (Low-Rank Adaptation)
+  - local: conceptual/external
+    title: External Resources


  title: Conceptual Guides
--- a/docs/source/architecture.md
+++ b/docs/source/architecture.md
@@ -10,7 +10,7 @@ This diagram shows well there are these separate components:

 - **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
 - **The model server**, responsible for receiving the gRPC requests and running inference on the model. If the model is sharded across multiple accelerators (e.g., multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
- **The launcher** is a helper thar will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
+- **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.

 The router and the model server can be two different machines, they do not need to be deployed together.

@@ -103,6 +103,7 @@ Several variants of the model server exist that are actively supported by Huggin

 - By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
 - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
+- A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
 - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
 - A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
 - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).

--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
 # Consuming Text Generation Inference

-There are many ways you can consume Text Generation Inference server in your applications. After launching, you can use the `/generate` route and make a `POST` request to get results from the server. You can also use the `/generate_stream` route if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python or TypeScrpt. For a final end-to-end experience, we also open-sourced ChatUI, a chat interface for open-source models.
+There are many ways to consume a Text Generation Inference (TGI) server in your applications. After launching the server, you can use the [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) `/v1/chat/completions` route and make a `POST` request to get results from the server. You can also pass `"stream": true` to the call if you want TGI to return a stream of tokens.
+
+For more information on the API, consult the OpenAPI documentation of `text-generation-inference` available [here](https://huggingface.github.io/text-generation-inference).
+
+You can make the requests using any tool of your preference, such as curl, Python, or TypeScript. For an end-to-end experience, we've open-sourced [ChatUI](https://github.com/huggingface/chat-ui), a chat interface for open-access models.

 ## curl

-After the launch, you can query the model using either the `/generate` or `/generate_stream` routes:
+After a successful server launch, you can query the model using the `/v1/chat/completions` route to get responses that are compliant with the OpenAI Chat Completion spec:

 ```bash
-curl 127.0.0.1:8080/generate \
+curl localhost:8080/v1/chat/completions \
    -X POST \
-    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
    -H 'Content-Type: application/json'
 ```

-
-## Inference Client
-
-[`huggingface-hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a nice high-level class, [`~huggingface_hub.InferenceClient`], which makes it easy to make calls to a TGI endpoint. `InferenceClient` also takes care of parameter validation and provides a simple to-use interface.
-You can simply install `huggingface-hub` package with pip.
+For non-chat use-cases, you can also use the `/generate` and `/generate_stream` routes.

 ```bash
-pip install huggingface-hub
-```
-
-Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
-
-```python
-from huggingface_hub import InferenceClient
-
-client = InferenceClient(model="http://127.0.0.1:8080")
-client.text_generation(prompt="Write a code for snake game")
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{
+  "inputs":"What is Deep Learning?",
+  "parameters":{
+    "max_new_tokens":20
+  }
+}' \
+    -H 'Content-Type: application/json'
 ```

-You can do streaming with `InferenceClient` by passing `stream=True`. Streaming will return tokens as they are being generated in the server. To use streaming, you can do as follows:
+## Python

-```python
-for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
-    print(token)
-```
+### Inference Client

-Another parameter you can use with TGI backend is `details`. You can get more details on generation (tokens, probabilities, etc.) by setting `details` to `True`. When it's specified, TGI will return a `TextGenerationResponse` or `TextGenerationStreamResponse` rather than a string or stream.
+[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a high-level class, [`huggingface_hub.InferenceClient`](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient), which makes it easy to make calls to TGI's Messages API. `InferenceClient` also takes care of parameter validation and provides a simple-to-use interface.

-```python
-output = client.text_generation(prompt="Meaning of life is", details=True)
-print(output)
+Install `huggingface_hub` package via pip.

-# TextGenerationResponse(generated_text=' a complex concept that is not always clear to the individual. It is a concept that is not always', details=Details(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=20, seed=None, prefill=[], tokens=[Token(id=267, text=' a', logprob=-2.0723474, special=False), Token(id=11235, text=' complex', logprob=-3.1272552, special=False), Token(id=17908, text=' concept', logprob=-1.3632495, special=False),..))
+```bash
+pip install huggingface_hub
 ```

-You can see how to stream below.
+You can now use `InferenceClient` the exact same way you would use the `OpenAI` client in Python:

 ```python
-output = client.text_generation(prompt="Meaning of life is", stream=True, details=True)
-print(next(iter(output)))
+from huggingface_hub import InferenceClient

-# TextGenerationStreamResponse(token=Token(id=267, text=' a', logprob=-2.0723474, special=False), generated_text=None, details=None)
+client = InferenceClient(
+    base_url="http://localhost:8080/v1/",
+)
+
+output = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Count to 10"},
+    ],
+    stream=True,
+    max_tokens=1024,
+)
+
+for chunk in output:
+    print(chunk.choices[0].delta.content)
 ```

-You can check out the details of the function [here](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient)
+You can check out more details about OpenAI compatibility [here](https://huggingface.co/docs/huggingface_hub/en/guides/inference#openai-compatibility).

+There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient).

-## ChatUI
+### OpenAI Client

-ChatUI is an open-source interface built for LLM serving. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
+You can directly use the OpenAI [Python](https://github.com/openai/openai-python) or [JS](https://github.com/openai/openai-node) clients to interact with TGI.

-To serve both ChatUI and TGI in same environment, simply add your own endpoints to the `MODELS` variable in `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
+Install the OpenAI Python package via pip.

+```bash
+pip install openai
 ```
-{
-// rest of the model config here
-"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
-}
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:8080/v1/",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message)
 ```

-![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
+## UI

-## Gradio
+### Gradio

 Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot with streaming mode using TGI and Gradio. Let's install Gradio and the Hub Python library first.

@@ -89,19 +133,28 @@ Assume you are serving your model on port 8080, we will query through [Inference
 import gradio as gr
 from huggingface_hub import InferenceClient

-client = InferenceClient(model="http://127.0.0.1:8080")
+client = InferenceClient(base_url="http://127.0.0.1:8080")

 def inference(message, history):
    partial_message = ""
-    for token in client.text_generation(message, max_new_tokens=20, stream=True):
-        partial_message += token
+    output = client.chat.completions.create(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": message},
+        ],
+        stream=True,
+        max_tokens=1024,
+    )
+
+    for chunk in output:
+        partial_message += chunk.choices[0].delta.content
        yield partial_message

 gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
-    description="This is the demo for Gradio UI consuming TGI endpoint with LLaMA 7B-Chat model.",
+    description="This is the demo for Gradio UI consuming TGI endpoint.",
    title="Gradio 🤝 TGI",
    examples=["Are tomatoes vegetables?"],
    retry_btn="Retry",
@@ -110,20 +163,7 @@ gr.ChatInterface(
 ).queue().launch()
 ```

-The UI looks like this 👇
-
-<div class="flex justify-center">
-    <img
-        class="block dark:hidden"
-        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
-    />
-    <img
-        class="hidden dark:block"
-        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
-    />
-</div>
-
-You can try the demo directly here 👇
+You can check out the UI and try the demo directly here 👇

 <div class="block dark:hidden">
 	<iframe
@@ -141,15 +181,19 @@ You can try the demo directly here 👇
 </div>


-You can disable streaming mode using `return` instead of `yield` in your inference function, like below.
+You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).

-```python
-def inference(message, history):
-    return client.text_generation(message, max_new_tokens=20)
-```
+### ChatUI

-You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).
+[ChatUI](https://github.com/huggingface/chat-ui) is an open-source interface built for consuming LLMs. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.

-## API documentation
+To serve both ChatUI and TGI in the same environment, simply add your own endpoints to the `MODELS` variable in the `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.

-You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
+```
+{
+// rest of the model config here
+"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
+}
+```
+
+![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@@ -19,6 +19,6 @@ docker run --gpus all \
    --shm-size 1g \
    -e HF_TOKEN=$token \
    -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```
--- a/docs/source/basic_tutorials/preparing_model.md
+++ b/docs/source/basic_tutorials/preparing_model.md
@@ -4,7 +4,7 @@ Text Generation Inference improves the model in several aspects.

 ## Quantization

-TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq) when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./../conceptual/quantization)
+TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [Marlin](https://github.com/IST-DASLab/marlin), [EETQ](https://github.com/NetEase-FuXi/EETQ), [EXL2](https://github.com/turboderp/exllamav2), and [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) quantization. To speed up inference with quantization, simply set the `quantize` flag to `bitsandbytes`, `gptq`, `awq`, `marlin`, `exl2`, `eetq` or `fp8` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq). Similarly, when using AWQ quantization, you need to point to one of [these models](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to the [quantization guide](./../conceptual/quantization).


 ## RoPE Scaling

--- a/docs/source/basic_tutorials/using_guidance.md
+++ b/docs/source/basic_tutorials/using_guidance.md
@@ -4,7 +4,7 @@ Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-

 These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!

-_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
+_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/v1/chat/completions` endpoint._

 ## How it works

@@ -157,7 +157,12 @@ from huggingface_hub import InferenceClient

 client = InferenceClient("http://localhost:3000")

-regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
+section_regex = "(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+regexp = rf"HELLO\.{section_regex}\.WORLD\.{section_regex}"
+
+# This is a more realistic example of an IP address regex
+# regexp = rf"{section_regex}\.{section_regex}\.{section_regex}\.{section_regex}"
+

 resp = client.text_generation(
    f"Whats Googles DNS? Please use the following regex: {regexp}",
@@ -170,7 +175,7 @@ resp = client.text_generation(


 print(resp)
-# 7.1.1.1
+# HELLO.255.WORLD.255

 ```


--- a/docs/source/basic_tutorials/visual_language_models.md
+++ b/docs/source/basic_tutorials/visual_language_models.md
@@ -84,7 +84,7 @@ print(chat)

 ```

-or with OpenAi's library:
+or with OpenAI's [client library](https://github.com/openai/openai-python):

 ```python
 from openai import OpenAI

--- a/docs/source/conceptual/external.md
+++ b/docs/source/conceptual/external.md
+# External Resources
+
+- Adyen wrote a detailed article about the interplay between TGI's main components: router and server.
+[LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)
--- a/docs/source/conceptual/lora.md
+++ b/docs/source/conceptual/lora.md
@@ -36,6 +36,24 @@ To use LoRA in TGI, when starting the server, you can specify the list of LoRA m
 LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
 ```

+To specify model revision, use `adapter_id@revision`, as follows:
+
+```bash
+LORA_ADAPTERS=predibase/customer_support@main,predibase/dbpedia@rev2
+```
+
+To use a locally stored LoRA adapter, use `adapter-name=/path/to/adapter`, as seen below. When you want to use this adapter, set `"parameters": {"adapter_id": "adapter-name"}` in the request payload.
+
+```bash
+LORA_ADAPTERS=myadapter=/some/path/to/adapter,myadapter2=/another/path/to/adapter
+```
+
+Note that it's possible to mix `adapter_id`s with `adapter_id=adapter_path` entries, e.g.:
+
+```bash
+LORA_ADAPTERS=predibase/dbpedia,myadapter=/path/to/dir/
+```
+
 In the server logs, you will see the following message:

 ```txt
@@ -60,6 +78,22 @@ curl 127.0.0.1:3000/generate \
 }'
 ```

+If you are using a lora adapter stored locally that was set in the following manner: `LORA_ADAPTERS=myadapter=/some/path/to/adapter`, here is an example payload:
+
+```bash
+curl 127.0.0.1:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+  "inputs": "Hello who are you?",
+  "parameters": {
+    "max_new_tokens": 40,
+    "adapter_id": "myadapter"
+  }
+}'
+```
+
+
 > **Note:** The LoRA feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally, documentation and an improved client library will be published soon.

 An updated tutorial with detailed examples will be published soon. Stay tuned!
--- a/docs/source/conceptual/quantization.md
+++ b/docs/source/conceptual/quantization.md
 # Quantization

-TGI offers GPTQ and bits-and-bytes quantization to quantize large language models.
+TGI offers many quantization schemes to run LLMs effectively and fast based on your use-case. TGI supports GPTQ, AWQ, bits-and-bytes, EETQ, Marlin, EXL2 and fp8 quantization.
+
+To leverage GPTQ, AWQ, Marlin and EXL2 quants, you must provide pre-quantized weights, whereas for bits-and-bytes, EETQ and fp8, weights are quantized by TGI on the fly.
+
+We recommend using the official quantization scripts for creating your quants:
+1. [AWQ](https://github.com/casper-hansen/AutoAWQ/blob/main/examples/quantize.py)
+2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
+3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)
+
+For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.
+
+## Quantization with bitsandbytes, EETQ & fp8
+
+bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
+
+8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
+In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes
+```
+
+4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
+
+In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes-nf4
+```
+
+You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
+
+Similarly, you can pass `--quantize eetq` or `--quantize fp8` for the respective quantization schemes.
+
+In addition to this, TGI allows creating GPTQ quants directly by passing the model weights and a calibration dataset.

 ## Quantization with GPTQ

@@ -14,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
 TGI allows you to either run an already GPTQ-quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize` like below 👇

 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize gptq
 ```

 Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
@@ -36,24 +70,3 @@ You can learn more about the quantization options by running `text-generation-se

 If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
 You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
-
-## Quantization with bitsandbytes
-
-bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
-
-8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
-In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
-
-```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes
-```
-
-4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
-
-In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
-
-```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4
-```
-
-You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
--- a/docs/source/conceptual/streaming.md
+++ b/docs/source/conceptual/streaming.md
 # Streaming

+
 ## What is Streaming?

 Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.
@@ -48,34 +49,29 @@ To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate o
 ```python
 from huggingface_hub import InferenceClient

-client = InferenceClient("http://127.0.0.1:8080")
-for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
-    print(token)
-
-# To
-# make
-# cheese
-#,
-# you
-# need
-# to
-# start
-# with
-# milk
-#.
-```
-
-If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text.
-
-```python
-for details in client.text_generation("How do you make cheese?", max_new_tokens=12, details=True, stream=True):
-    print(details)
-
-#TextGenerationStreamResponse(token=Token(id=193, text='\n', logprob=-0.007358551, special=False), generated_text=None, details=None)
-#TextGenerationStreamResponse(token=Token(id=2044, text='To', logprob=-1.1357422, special=False), generated_text=None, details=None)
-#TextGenerationStreamResponse(token=Token(id=717, text=' make', logprob=-0.009841919, special=False), generated_text=None, details=None)
-#...
-#TextGenerationStreamResponse(token=Token(id=25, text='.', logprob=-1.3408203, special=False), generated_text='\nTo make cheese, you need to start with milk.', details=StreamDetails(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=12, seed=None))
+client = InferenceClient(base_url="http://127.0.0.1:8080")
+output = client.chat.completions.create(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Count to 10"},
+    ],
+    stream=True,
+    max_tokens=1024,
+)
+
+for chunk in output:
+    print(chunk.choices[0].delta.content)
+
+# 1
+# 2
+# 3
+# 4
+# 5
+# 6
+# 7
+# 8
+# 9
+# 10
 ```

 The `huggingface_hub` library also comes with an `AsyncInferenceClient` in case you need to handle the requests concurrently.
@@ -83,31 +79,46 @@ The `huggingface_hub` library also comes with an `AsyncInferenceClient` in case
 ```python
 from huggingface_hub import AsyncInferenceClient

-client = AsyncInferenceClient("http://127.0.0.1:8080")
-async for token in await client.text_generation("How do you make cheese?", stream=True):
-    print(token)
-
-# To
-# make
-# cheese
-#,
-# you
-# need
-# to
-# start
-# with
-# milk
+client = AsyncInferenceClient(base_url="http://127.0.0.1:8080")
+async def main():
+    stream = await client.chat.completions.create(
+        messages=[{"role": "user", "content": "Say this is a test"}],
+        stream=True,
+    )
+    async for chunk in stream:
+        print(chunk.choices[0].delta.content or "", end="")
+
+asyncio.run(main())
+
+# This
+# is
+# a
+# test
 #.
 ```

 ### Streaming with cURL

-To use the `generate_stream` endpoint with curl, you can add the `-N` flag, which disables curl default buffering and shows data as it arrives from the server
+To use the OpenAI Chat Completions compatible Messages API `v1/chat/completions` endpoint with curl, you can add the `-N` flag, which disables curl's default buffering and shows data as it arrives from the server.

 ```curl
-curl -N 127.0.0.1:8080/generate_stream \
+curl -N localhost:8080/v1/chat/completions \
    -X POST \
-    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
    -H 'Content-Type: application/json'
 ```


--- a/docs/source/installation_amd.md
+++ b/docs/source/installation_amd.md
@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
    --device=/dev/kfd --device=/dev/dri --group-add video \
    --ipc=host --shm-size 256g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.1.0-rocm \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-rocm \
    --model-id $model
 ```

@@ -31,6 +31,12 @@ Two implementations of Flash Attention are available for ROCm, the first is [ROC

 By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, the FA Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.

+## Custom PagedAttention
+
+For better performance on ROCm, a custom Paged Attention kernel is available and is enabled by default. To disable it and fall back to the PagedAttention v2 kernel, set the environment variable `ROCM_USE_CUSTOM_PAGED_ATTN=0`.
+
+The custom kernel supports bf16 and fp16 data types, block size of 16, head size of 128, a maximum context length of 16k, and GQA ratios between 1 and 16. For other configurations, we use the PagedAttention v2 kernel.
+
 ## Unsupported features

 The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:

--- a/docs/source/installation_intel.md
+++ b/docs/source/installation_intel.md
+# Using TGI with Intel GPUs
+
+TGI optimized models are supported on Intel Data Center GPU [Max1100](https://www.intel.com/content/www/us/en/products/sku/232876/intel-data-center-gpu-max-1100/specifications.html) and [Max1550](https://www.intel.com/content/www/us/en/products/sku/232873/intel-data-center-gpu-max-1550/specifications.html). The recommended usage is through Docker.
+
+
+On a server powered by Intel GPUs, TGI can be launched with the following command:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --rm --privileged --cap-add=sys_nice \
+    --device=/dev/dri \
+    --ipc=host --shm-size 1g --net host -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-intel-xpu \
+    --model-id $model --cuda-graphs 0
+```
+
+# Using TGI with Intel CPUs
+
+Intel® Extension for PyTorch (IPEX) also provides further optimizations for Intel CPUs. IPEX provides optimization operations such as flash attention, paged attention, Add + LayerNorm, RoPE and more.
+
+On a server powered by Intel CPU, TGI can be launched with the following command:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --rm --privileged --cap-add=sys_nice \
+    --device=/dev/dri \
+    --ipc=host --shm-size 1g --net host -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu \
+    --model-id $model --cuda-graphs 0
+```
+
+The launched TGI server can then be queried from clients. Make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
--- a/docs/source/installation_nvidia.md
+++ b/docs/source/installation_nvidia.md
@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

 docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.1.0 \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```


--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -11,17 +11,25 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.1.0 \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```

+<Tip>
+
+If you want to serve gated or private models, please refer to
+[this guide](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access)
+for detailed instructions.
+
+</Tip>
+
 ### Supported hardware

-TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
+TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Intel GPUs](./installation_intel), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.

 ## Consuming TGI

-Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
+Once TGI is running, you can use the `generate` endpoint or the OpenAI Chat Completion API-compatible [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.

 <inferencesnippet>
 <python>
@@ -88,7 +96,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:2.1.0 --help
+docker run ghcr.io/huggingface/text-generation-inference:2.4.0 --help
 ```

 </Tip>
--- a/docs/source/messages_api.md
+++ b/docs/source/messages_api.md
-# Messages API
+# HTTP API Reference

-Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+#### Table of Contents

-> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
+- [Text Generation Inference custom API](#text-generation-inference-custom-api)
+- [OpenAI Messages API](#openai-messages-api)
+  - [Making a Request](#making-a-request)
+  - [Streaming](#streaming)
+  - [Synchronous](#synchronous)
+  - [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
+  - [Cloud Providers](#cloud-providers)
+      - [Amazon SageMaker](#amazon-sagemaker)

-#### Table of Contents
+The HTTP API is a RESTful API that allows you to interact with the text-generation-inference component. Two endpoints are available:
+* Text Generation Inference [custom API](https://huggingface.github.io/text-generation-inference/)
+* OpenAI's [Messages API](#openai-messages-api)
+
+
+## Text Generation Inference custom API

- [Making a Request](#making-a-request)
- [Streaming](#streaming)
- [Synchronous](#synchronous)
- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
- [Cloud Providers](#cloud-providers)
-  - [Amazon SageMaker](#amazon-sagemaker)
+Check the [API documentation](https://huggingface.github.io/text-generation-inference/) for more information on how to interact with the Text Generation Inference API.
+
+## OpenAI Messages API
+
+Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+
+> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.

 ## Making a Request

@@ -128,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene

 ## Amazon SageMaker

-To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
-
-This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API.
+Amazon SageMaker natively supports the Messages API:

 ```python
 import json
@@ -148,12 +159,11 @@ except ValueError:
 hub = {
 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
 'SM_NUM_GPUS': json.dumps(1),
- 'MESSAGES_API_ENABLED': True
 }

 # create Hugging Face Model Class
 huggingface_model = HuggingFaceModel(
- image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+ image_uri=get_huggingface_llm_image_uri("huggingface",version="2.4.0"),
 env=hub,
 role=role,
 )