"lm_eval/tasks/bigbench/generate_until_template_yaml" did not exist on "1a77b4d54e84a7b9a49db2bbc7325ede487c9d00"
Commit 81a882ad authored by jixx

add tgi2.4.0

parent 9822d7f6
......@@ -757,7 +757,12 @@ class AsyncClient:
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
payload_data = (
payload.lstrip("data:").rstrip("\n").removeprefix(" ")
)
if payload_data == "[DONE]":
break
json_payload = json.loads(payload_data)
try:
response = ChatCompletionChunk(**json_payload)
yield response
......
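For reference, here is a minimal standalone sketch (not the library code itself) of how a decoded SSE `data:` line is treated after this change, including the new OpenAI-style `[DONE]` sentinel check; the helper name is illustrative only:

```python
import json
from typing import Optional


def parse_sse_line(byte_payload: bytes) -> Optional[dict]:
    """Return the JSON chunk for a `data:` line, or None for non-data lines and the [DONE] sentinel."""
    payload = byte_payload.decode("utf-8")
    if not payload.startswith("data:"):
        return None
    data = payload.lstrip("data:").rstrip("\n").removeprefix(" ")
    if data == "[DONE]":
        return None
    return json.loads(data)


assert parse_sse_line(b'data: {"id": "abc"}\n') == {"id": "abc"}
assert parse_sse_line(b"data: [DONE]\n") is None
```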
......@@ -21,7 +21,7 @@ def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
List[DeployedModel]: list of all currently deployed models
"""
resp = requests.get(
f"https://api-inference.huggingface.co/framework/text-generation-inference",
"https://api-inference.huggingface.co/framework/text-generation-inference",
headers=headers,
timeout=5,
)
......
......@@ -28,11 +28,17 @@ class ToolCall(BaseModel):
function: dict
class Chunk(BaseModel):
type: str
text: Optional[str] = None
image_url: Any = None
class Message(BaseModel):
# Role of the message sender
role: str
# Content of the message
content: Optional[str] = None
content: Optional[Union[str, List[Chunk]]] = None
# Optional name of the message sender
name: Optional[str] = None
# Tool calls associated with the chat completion
......@@ -61,7 +67,7 @@ class ChoiceDeltaToolCall(BaseModel):
class ChoiceDelta(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[ChoiceDeltaToolCall]
tool_calls: Optional[ChoiceDeltaToolCall] = None
class Choice(BaseModel):
......@@ -168,7 +174,7 @@ class ChatCompletionComplete(BaseModel):
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
finish_reason: Optional[str]
# Usage details of the chat completion
usage: Optional[Any] = None
......@@ -191,6 +197,7 @@ class ChatCompletionChunk(BaseModel):
model: str
system_fingerprint: str
choices: List[Choice]
usage: Optional[Any] = None
class Parameters(BaseModel):
......
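A quick usage sketch of the updated models above (the field names mirror the diff; the example values and image URL are made up), showing that `content` now accepts either a plain string or a list of chunks:

```python
from typing import Any, List, Optional, Union

from pydantic import BaseModel


class Chunk(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Any = None


class Message(BaseModel):
    role: str
    content: Optional[Union[str, List[Chunk]]] = None
    name: Optional[str] = None


# Plain-text message, unchanged behaviour
text_only = Message(role="user", content="What is deep learning?")

# Multimodal message: a list of chunks mixing text and an image URL
multimodal = Message(
    role="user",
    content=[
        Chunk(type="text", text="What is shown in this image?"),
        Chunk(type="image_url", image_url={"url": "https://example.com/cat.png"}),
    ],
)
print(multimodal)
```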
......@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.1.1"
"version": "2.4.0"
},
"paths": {
"/": {
......@@ -316,6 +316,98 @@
}
}
},
"/invocations": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens from Sagemaker request",
"operationId": "sagemaker_compatibility",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SagemakerRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Chat Completion",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SagemakerResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/SagemakerStreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error",
"error_type": "validation"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation",
"error_type": "generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded",
"error_type": "overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation",
"error_type": "incomplete_generation"
}
}
}
}
}
}
},
"/metrics": {
"get": {
"tags": [
......@@ -492,12 +584,12 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Completion"
"$ref": "#/components/schemas/CompletionFinal"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionCompleteChunk"
"$ref": "#/components/schemas/Chunk"
}
}
}
......@@ -556,6 +648,37 @@
}
}
}
},
"/v1/models": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Get model info",
"operationId": "openai_get_model_info",
"responses": {
"200": {
"description": "Served model info",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ModelInfo"
}
}
}
},
"404": {
"description": "Model not found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
}
}
}
}
}
}
}
},
"components": {
......@@ -711,6 +834,14 @@
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"allOf": [
{
"$ref": "#/components/schemas/Usage"
}
],
"nullable": true
}
}
},
......@@ -809,7 +940,6 @@
"ChatRequest": {
"type": "object",
"required": [
"model",
"messages"
],
"properties": {
......@@ -820,6 +950,13 @@
"example": "1.0",
"nullable": true
},
"guideline": {
"type": "string",
"description": "A guideline to be used in the chat_template",
"default": "null",
"example": "null",
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {
......@@ -854,7 +991,8 @@
"model": {
"type": "string",
"description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
"example": "mistralai/Mistral-7B-Instruct-v0.2",
"nullable": true
},
"n": {
"type": "integer",
......@@ -899,6 +1037,14 @@
"stream": {
"type": "boolean"
},
"stream_options": {
"allOf": [
{
"$ref": "#/components/schemas/StreamOptions"
}
],
"nullable": true
},
"temperature": {
"type": "number",
"format": "float",
......@@ -909,7 +1055,7 @@
"tool_choice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
"$ref": "#/components/schemas/ToolChoice"
}
],
"nullable": true
......@@ -917,7 +1063,7 @@
"tool_prompt": {
"type": "string",
"description": "A prompt to be appended before the tools",
"example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
"example": "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.",
"nullable": true
},
"tools": {
......@@ -1116,7 +1262,6 @@
"CompletionRequest": {
"type": "object",
"required": [
"model",
"prompt"
],
"properties": {
......@@ -1138,7 +1283,8 @@
"model": {
"type": "string",
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
"example": "mistralai/Mistral-7B-Instruct-v0.2",
"nullable": true
},
"prompt": {
"$ref": "#/components/schemas/Prompt"
......@@ -1324,6 +1470,17 @@
}
}
},
"FunctionName": {
"type": "object",
"required": [
"name"
],
"properties": {
"name": {
"type": "string"
}
}
},
"GenerateParameters": {
"type": "object",
"properties": {
......@@ -1569,16 +1726,11 @@
"type": "object",
"required": [
"model_id",
"model_dtype",
"model_device_type",
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_tokens",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"max_client_batch_size",
"router",
......@@ -1590,18 +1742,6 @@
"example": "null",
"nullable": true
},
"max_batch_size": {
"type": "integer",
"example": "null",
"nullable": true,
"minimum": 0
},
"max_batch_total_tokens": {
"type": "integer",
"format": "int32",
"example": "32000",
"minimum": 0
},
"max_best_of": {
"type": "integer",
"example": "2",
......@@ -1633,19 +1773,6 @@
"example": "2048",
"minimum": 0
},
"max_waiting_tokens": {
"type": "integer",
"example": "20",
"minimum": 0
},
"model_device_type": {
"type": "string",
"example": "cuda"
},
"model_dtype": {
"type": "string",
"example": "torch.float16"
},
"model_id": {
"type": "string",
"description": "Model info",
......@@ -1679,11 +1806,6 @@
"version": {
"type": "string",
"example": "0.5.0"
},
"waiting_served_ratio": {
"type": "number",
"format": "float",
"example": "1.2"
}
}
},
......@@ -1708,6 +1830,101 @@
}
}
},
"MessageChunk": {
"oneOf": [
{
"type": "object",
"required": [
"text",
"type"
],
"properties": {
"text": {
"type": "string"
},
"type": {
"type": "string",
"enum": [
"text"
]
}
}
},
{
"type": "object",
"required": [
"image_url",
"type"
],
"properties": {
"image_url": {
"$ref": "#/components/schemas/Url"
},
"type": {
"type": "string",
"enum": [
"image_url"
]
}
}
}
],
"discriminator": {
"propertyName": "type"
}
},
"MessageContent": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"$ref": "#/components/schemas/MessageChunk"
}
}
]
},
"ModelInfo": {
"type": "object",
"required": [
"id",
"object",
"created",
"owned_by"
],
"properties": {
"created": {
"type": "integer",
"format": "int64",
"example": 1686935002,
"minimum": 0
},
"id": {
"type": "string",
"example": "gpt2"
},
"object": {
"type": "string",
"example": "model"
},
"owned_by": {
"type": "string",
"example": "openai"
}
}
},
"OutputMessage": {
"oneOf": [
{
"$ref": "#/components/schemas/TextMessage"
},
{
"$ref": "#/components/schemas/ToolCallMessage"
}
]
},
"PrefillToken": {
"type": "object",
"required": [
......@@ -1740,6 +1957,45 @@
"type": "string"
}
},
"SagemakerRequest": {
"oneOf": [
{
"$ref": "#/components/schemas/CompatGenerateRequest"
},
{
"$ref": "#/components/schemas/ChatRequest"
},
{
"$ref": "#/components/schemas/CompletionRequest"
}
]
},
"SagemakerResponse": {
"oneOf": [
{
"$ref": "#/components/schemas/GenerateResponse"
},
{
"$ref": "#/components/schemas/ChatCompletion"
},
{
"$ref": "#/components/schemas/CompletionFinal"
}
]
},
"SagemakerStreamResponse": {
"oneOf": [
{
"$ref": "#/components/schemas/StreamResponse"
},
{
"$ref": "#/components/schemas/ChatCompletionChunk"
},
{
"$ref": "#/components/schemas/Chunk"
}
]
},
"SimpleToken": {
"type": "object",
"required": [
......@@ -1775,7 +2031,8 @@
"type": "object",
"required": [
"finish_reason",
"generated_tokens"
"generated_tokens",
"input_length"
],
"properties": {
"finish_reason": {
......@@ -1787,6 +2044,12 @@
"example": 1,
"minimum": 0
},
"input_length": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"seed": {
"type": "integer",
"format": "int64",
......@@ -1796,6 +2059,19 @@
}
}
},
"StreamOptions": {
"type": "object",
"required": [
"include_usage"
],
"properties": {
"include_usage": {
"type": "boolean",
"description": "If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
"example": "true"
}
}
},
"StreamResponse": {
"type": "object",
"required": [
......@@ -1834,6 +2110,23 @@
}
}
},
"TextMessage": {
"type": "object",
"required": [
"role",
"content"
],
"properties": {
"content": {
"type": "string",
"example": "My name is David and I"
},
"role": {
"type": "string",
"example": "user"
}
}
},
"Token": {
"type": "object",
"required": [
......@@ -1906,15 +2199,64 @@
}
}
},
"ToolCallDelta": {
"type": "object",
"required": [
"role",
"tool_calls"
],
"properties": {
"role": {
"type": "string",
"example": "assistant"
},
"tool_calls": {
"$ref": "#/components/schemas/DeltaToolCall"
}
}
},
"ToolCallMessage": {
"type": "object",
"required": [
"role",
"tool_calls"
],
"properties": {
"role": {
"type": "string",
"example": "assistant"
},
"tool_calls": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolCall"
}
}
}
},
"ToolChoice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"ToolType": {
"oneOf": [
{
"type": "object",
"default": null,
"nullable": true
"type": "string",
"description": "Means the model can pick between generating a message or calling one or more tools.",
"enum": [
"auto"
]
},
{
"type": "string"
"type": "string",
"description": "Means the model will not call any tool and instead generates a message.",
"enum": [
"none"
]
},
{
"type": "object",
......@@ -1927,7 +2269,20 @@
}
}
}
]
],
"description": "Controls which (if any) tool is called by the model.",
"example": "auto"
},
"Url": {
"type": "object",
"required": [
"url"
],
"properties": {
"url": {
"type": "string"
}
}
},
"Usage": {
"type": "object",
......
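As a rough illustration of two additions in this spec — the `StreamOptions`/`Usage` fields and the `/v1/models` route — here is a hedged sketch using the OpenAI Python client against a local TGI server. The host, port, and prompt are placeholders, and a recent `openai` client that supports `stream_options` is assumed:

```python
import requests
from openai import OpenAI

# New /v1/models route: returns ModelInfo (id, object, created, owned_by)
print(requests.get("http://localhost:8080/v1/models", timeout=5).json())

client = OpenAI(base_url="http://localhost:8080/v1/", api_key="-")
stream = client.chat.completions.create(
    model="tgi",
    messages=[{"role": "user", "content": "What is deep learning?"}],
    stream=True,
    # include_usage asks the server to send a final usage-only chunk before [DONE]
    stream_options={"include_usage": True},
    max_tokens=20,
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    if chunk.usage is not None:
        print("\n", chunk.usage)
```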
......@@ -3,6 +3,8 @@
title: Text Generation Inference
- local: quicktour
title: Quick Tour
- local: supported_models
title: Supported Models
- local: installation_nvidia
title: Using TGI with Nvidia GPUs
- local: installation_amd
......@@ -11,14 +13,15 @@
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation_intel
title: Using TGI with Intel GPUs
- local: installation
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
- local: usage_statistics
title: Usage Statistics
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
......@@ -29,8 +32,6 @@
title: Serving Private & Gated Models
- local: basic_tutorials/using_cli
title: Using TGI CLI
- local: basic_tutorials/launcher
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
......@@ -44,6 +45,14 @@
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: reference/launcher
title: All TGI CLI options
- local: reference/metrics
title: Exported Metrics
- local: reference/api_reference
title: API Reference
title: Reference
- sections:
- local: conceptual/streaming
title: Streaming
......@@ -60,9 +69,11 @@
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)
- local: conceptual/external
title: External Resources
title: Conceptual Guides
......@@ -10,7 +10,7 @@ This diagram shows well there are these separate components:
- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
- **The model server**, responsible for receiving the gRPC requests and running inference on the model. If the model is sharded across multiple accelerators (e.g. multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
- **The launcher** is a helper thar will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
- **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
The router and the model server can be two different machines; they do not need to be deployed together.
......@@ -103,6 +103,7 @@ Several variants of the model server exist that are actively supported by Huggin
- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
......
# Consuming Text Generation Inference
There are many ways you can consume Text Generation Inference server in your applications. After launching, you can use the `/generate` route and make a `POST` request to get results from the server. You can also use the `/generate_stream` route if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python or TypeScrpt. For a final end-to-end experience, we also open-sourced ChatUI, a chat interface for open-source models.
There are many ways to consume Text Generation Inference (TGI) server in your applications. After launching the server, you can use the [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) `/v1/chat/completions` route and make a `POST` request to get results from the server. You can also pass `"stream": true` to the call if you want TGI to return a stream of tokens.
For more information on the API, consult the OpenAPI documentation of `text-generation-inference` available [here](https://huggingface.github.io/text-generation-inference).
You can make the requests using any tool of your preference, such as curl, Python, or TypeScript. For an end-to-end experience, we've open-sourced [ChatUI](https://github.com/huggingface/chat-ui), a chat interface for open-access models.
## curl
After the launch, you can query the model using either the `/generate` or `/generate_stream` routes:
After a successful server launch, you can query the model using the `v1/chat/completions` route to get responses that are compliant with the OpenAI Chat Completion spec:
```bash
curl 127.0.0.1:8080/generate \
curl localhost:8080/v1/chat/completions \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-d '{
"model": "tgi",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is deep learning?"
}
],
"stream": true,
"max_tokens": 20
}' \
-H 'Content-Type: application/json'
```
## Inference Client
[`huggingface-hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a nice high-level class, [`~huggingface_hub.InferenceClient`], which makes it easy to make calls to a TGI endpoint. `InferenceClient` also takes care of parameter validation and provides a simple to-use interface.
You can simply install `huggingface-hub` package with pip.
For non-chat use-cases, you can also use the `/generate` and `/generate_stream` routes.
```bash
pip install huggingface-hub
```
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
```python
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
client.text_generation(prompt="Write a code for snake game")
curl 127.0.0.1:8080/generate \
-X POST \
-d '{
"inputs":"What is Deep Learning?",
"parameters":{
"max_new_tokens":20
}
}' \
-H 'Content-Type: application/json'
```
You can do streaming with `InferenceClient` by passing `stream=True`. Streaming will return tokens as they are being generated in the server. To use streaming, you can do as follows:
## Python
```python
for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
print(token)
```
### Inference Client
Another parameter you can use with TGI backend is `details`. You can get more details on generation (tokens, probabilities, etc.) by setting `details` to `True`. When it's specified, TGI will return a `TextGenerationResponse` or `TextGenerationStreamResponse` rather than a string or stream.
[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a high-level class, [`huggingface_hub.InferenceClient`](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient), which makes it easy to make calls to TGI's Messages API. `InferenceClient` also takes care of parameter validation and provides a simple-to-use interface.
```python
output = client.text_generation(prompt="Meaning of life is", details=True)
print(output)
Install `huggingface_hub` package via pip.
# TextGenerationResponse(generated_text=' a complex concept that is not always clear to the individual. It is a concept that is not always', details=Details(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=20, seed=None, prefill=[], tokens=[Token(id=267, text=' a', logprob=-2.0723474, special=False), Token(id=11235, text=' complex', logprob=-3.1272552, special=False), Token(id=17908, text=' concept', logprob=-1.3632495, special=False),..))
```bash
pip install huggingface_hub
```
You can see how to stream below.
You can now use `InferenceClient` the exact same way you would use the `OpenAI` client in Python.
```python
output = client.text_generation(prompt="Meaning of life is", stream=True, details=True)
print(next(iter(output)))
from huggingface_hub import InferenceClient
# TextGenerationStreamResponse(token=Token(id=267, text=' a', logprob=-2.0723474, special=False), generated_text=None, details=None)
client = InferenceClient(
base_url="http://localhost:8080/v1/",
)
output = client.chat.completions.create(
model="tgi",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Count to 10"},
],
stream=True,
max_tokens=1024,
)
for chunk in output:
print(chunk.choices[0].delta.content)
```
You can check out the details of the function [here](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient)
You can check out more details about OpenAI compatibility [here](https://huggingface.co/docs/huggingface_hub/en/guides/inference#openai-compatibility).
There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient)
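For example, a minimal async sketch (assuming a local TGI server on port 8080) mirrors the synchronous snippet above:

```python
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    client = AsyncInferenceClient(base_url="http://localhost:8080/v1/")
    stream = await client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Count to 10"},
        ],
        stream=True,
        max_tokens=1024,
    )
    async for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="")


asyncio.run(main())
```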
## ChatUI
### OpenAI Client
ChatUI is an open-source interface built for LLM serving. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
You can directly use the OpenAI [Python](https://github.com/openai/openai-python) or [JS](https://github.com/openai/openai-node) clients to interact with TGI.
To serve both ChatUI and TGI in same environment, simply add your own endpoints to the `MODELS` variable in `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
Install the OpenAI Python package via pip.
```bash
pip install openai
```
{
// rest of the model config here
"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
}
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(
base_url="http://localhost:8080/v1/",
api_key="-"
)
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{"role": "system", "content": "You are a helpful assistant." },
{"role": "user", "content": "What is deep learning?"}
],
stream=True
)
# iterate and print stream
for message in chat_completion:
print(message)
```
![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
## UI
## Gradio
### Gradio
Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot with streaming mode using TGI and Gradio. First, install Gradio and the Hub Python library.
......@@ -89,19 +133,28 @@ Assume you are serving your model on port 8080, we will query through [Inference
import gradio as gr
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
client = InferenceClient(base_url="http://127.0.0.1:8080")
def inference(message, history):
partial_message = ""
for token in client.text_generation(message, max_new_tokens=20, stream=True):
partial_message += token
output = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": message},
],
stream=True,
max_tokens=1024,
)
for chunk in output:
partial_message += chunk.choices[0].delta.content
yield partial_message
gr.ChatInterface(
inference,
chatbot=gr.Chatbot(height=300),
textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
description="This is the demo for Gradio UI consuming TGI endpoint with LLaMA 7B-Chat model.",
description="This is the demo for Gradio UI consuming TGI endpoint.",
title="Gradio 🤝 TGI",
examples=["Are tomatoes vegetables?"],
retry_btn="Retry",
......@@ -110,20 +163,7 @@ gr.ChatInterface(
).queue().launch()
```
The UI looks like this 👇
<div class="flex justify-center">
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
/>
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
/>
</div>
You can try the demo directly here 👇
You can check out the UI and try the demo directly here 👇
<div class="block dark:hidden">
<iframe
......@@ -141,15 +181,19 @@ You can try the demo directly here 👇
</div>
You can disable streaming mode using `return` instead of `yield` in your inference function, like below.
You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).
```python
def inference(message, history):
return client.text_generation(message, max_new_tokens=20)
```
### ChatUI
You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).
[ChatUI](https://github.com/huggingface/chat-ui) is an open-source interface built for consuming LLMs. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
## API documentation
To serve both ChatUI and TGI in the same environment, simply add your own endpoints to the `MODELS` variable in the `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
```
{
// rest of the model config here
"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
}
```
![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
......@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```
......@@ -4,7 +4,7 @@ Text Generation Inference improves the model in several aspects.
## Quantization
TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq) when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./../conceptual/quantization)
TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [Marlin](https://github.com/IST-DASLab/marlin), [EETQ](https://github.com/NetEase-FuXi/EETQ), [EXL2](https://github.com/turboderp/exllamav2), and [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq`, `awq`, `marlin`, `exl2`, `eetq` or `fp8` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq). Similarly, when using AWQ quantization, you need to point to one of [these models](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./../conceptual/quantization)
## RoPE Scaling
......
......@@ -4,7 +4,7 @@ Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-
These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `v1/chat/completions` endpoint._
## How it works
......@@ -157,7 +157,12 @@ from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
section_regex = "(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
regexp = f"HELLO\.{section_regex}\.WORLD\.{section_regex}"
# This is a more realistic example of an ip address regex
# regexp = f"{section_regex}\.{section_regex}\.{section_regex}\.{section_regex}"
resp = client.text_generation(
f"Whats Googles DNS? Please use the following regex: {regexp}",
......@@ -170,7 +175,7 @@ resp = client.text_generation(
print(resp)
# 7.1.1.1
# HELLO.255.WORLD.255
```
......
......@@ -84,7 +84,7 @@ print(chat)
```
or with OpenAi's library:
or with OpenAI's [client library](https://github.com/openai/openai-python):
```python
from openai import OpenAI
......
# External Resources
- Adyen wrote a detailed article about the interplay between TGI's main components: router and server.
[LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)
......@@ -36,6 +36,24 @@ To use LoRA in TGI, when starting the server, you can specify the list of LoRA m
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
```
To specify model revision, use `adapter_id@revision`, as follows:
```bash
LORA_ADAPTERS=predibase/customer_support@main,predibase/dbpedia@rev2
```
To use a locally stored LoRA adapter, use `adapter-name=/path/to/adapter`, as seen below. When you want to use this adapter, set `"parameters": {"adapter_id": "adapter-name"}`.
```bash
LORA_ADAPTERS=myadapter=/some/path/to/adapter,myadapter2=/another/path/to/adapter
```
Note that it's possible to mix adapter IDs with `adapter_id=adapter_path` entries, e.g.:
```bash
LORA_ADAPTERS=predibase/dbpedia,myadapter=/path/to/dir/
```
In the server logs, you will see the following message:
```txt
......@@ -60,6 +78,22 @@ curl 127.0.0.1:3000/generate \
}'
```
If you are using a LoRA adapter stored locally that was set in the following manner: `LORA_ADAPTERS=myadapter=/some/path/to/adapter`, here is an example payload:
```bash
curl 127.0.0.1:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs": "Hello who are you?",
"parameters": {
"max_new_tokens": 40,
"adapter_id": "myadapter"
}
}'
```
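The same request can also be made from Python with `requests` (a sketch, assuming TGI is running locally on port 3000 as in the curl example above):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:3000/generate",
    headers={"Content-Type": "application/json"},
    json={
        "inputs": "Hello who are you?",
        "parameters": {"max_new_tokens": 40, "adapter_id": "myadapter"},
    },
    timeout=60,
)
print(resp.json()["generated_text"])
```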
> **Note:** The LoRA feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally, documentation and an improved client library will be published soon.
An updated tutorial with detailed examples will be published soon. Stay tuned!
# Quantization
TGI offers GPTQ and bits-and-bytes quantization to quantize large language models.
TGI offers many quantization schemes to run LLMs effectively and quickly, based on your use case. TGI supports GPTQ, AWQ, bits-and-bytes, EETQ, Marlin, EXL2 and fp8 quantization.
To leverage GPTQ, AWQ, Marlin and EXL2 quants, you must provide pre-quantized weights, whereas for bits-and-bytes, EETQ and fp8, weights are quantized by TGI on the fly.
We recommend using the official quantization scripts for creating your quants:
1. [AWQ](https://github.com/casper-hansen/AutoAWQ/blob/main/examples/quantize.py)
2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)
For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.
## Quantization with bitsandbytes, EETQ & fp8
bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
Similarly, you can pass `--quantize eetq` or `--quantize fp8` for the respective quantization schemes.
In addition to this, TGI allows creating GPTQ quants directly by passing the model weights and a calibration dataset.
## Quantization with GPTQ
......@@ -14,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to either run an already GPTQ-quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
......@@ -36,24 +70,3 @@ You can learn more about the quantization options by running `text-generation-se
If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
## Quantization with bitsandbytes
bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
# Streaming
## What is Streaming?
Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.
......@@ -48,34 +49,29 @@ To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate o
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://127.0.0.1:8080")
for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
print(token)
# To
# make
# cheese
#,
# you
# need
# to
# start
# with
# milk
#.
```
If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text.
```python
for details in client.text_generation("How do you make cheese?", max_new_tokens=12, details=True, stream=True):
print(details)
#TextGenerationStreamResponse(token=Token(id=193, text='\n', logprob=-0.007358551, special=False), generated_text=None, details=None)
#TextGenerationStreamResponse(token=Token(id=2044, text='To', logprob=-1.1357422, special=False), generated_text=None, details=None)
#TextGenerationStreamResponse(token=Token(id=717, text=' make', logprob=-0.009841919, special=False), generated_text=None, details=None)
#...
#TextGenerationStreamResponse(token=Token(id=25, text='.', logprob=-1.3408203, special=False), generated_text='\nTo make cheese, you need to start with milk.', details=StreamDetails(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=12, seed=None))
client = InferenceClient(base_url="http://127.0.0.1:8080")
output = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Count to 10"},
],
stream=True,
max_tokens=1024,
)
for chunk in output:
print(chunk.choices[0].delta.content)
# 1
# 2
# 3
# 4
# 5
# 6
# 7
# 8
# 9
# 10
```
The `huggingface_hub` library also comes with an `AsyncInferenceClient` in case you need to handle the requests concurrently.
......@@ -83,31 +79,46 @@ The `huggingface_hub` library also comes with an `AsyncInferenceClient` in case
```python
import asyncio
from huggingface_hub import AsyncInferenceClient
client = AsyncInferenceClient("http://127.0.0.1:8080")
async for token in await client.text_generation("How do you make cheese?", stream=True):
print(token)
# To
# make
# cheese
#,
# you
# need
# to
# start
# with
# milk
client = AsyncInferenceClient(base_url="http://127.0.0.1:8080")
async def main():
stream = await client.chat.completions.create(
messages=[{"role": "user", "content": "Say this is a test"}],
stream=True,
)
async for chunk in stream:
print(chunk.choices[0].delta.content or "", end="")
asyncio.run(main())
# This
# is
# a
# test
#.
```
### Streaming with cURL
To use the `generate_stream` endpoint with curl, you can add the `-N` flag, which disables curl default buffering and shows data as it arrives from the server
To use the OpenAI Chat Completions compatible Messages API `v1/chat/completions` endpoint with curl, you can add the `-N` flag, which disables curl's default buffering and shows data as it arrives from the server:
```curl
curl -N 127.0.0.1:8080/generate_stream \
curl localhost:8080/v1/chat/completions \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-d '{
"model": "tgi",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is deep learning?"
}
],
"stream": true,
"max_tokens": 20
}' \
-H 'Content-Type: application/json'
```
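If you prefer not to use curl or a client library, the same stream can be consumed with plain `requests` (a sketch, assuming the same local server; each event is a `data: {...}` line and the stream ends with `data: [DONE]`):

```python
import json

import requests

payload = {
    "model": "tgi",
    "messages": [{"role": "user", "content": "What is deep learning?"}],
    "stream": True,
    "max_tokens": 20,
}
with requests.post(
    "http://localhost:8080/v1/chat/completions", json=payload, stream=True, timeout=60
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data:"):
            continue
        data = line.decode("utf-8").removeprefix("data:").strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        if chunk["choices"]:
            print(chunk["choices"][0]["delta"].get("content") or "", end="")
```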
......
......@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.1.0-rocm \
ghcr.io/huggingface/text-generation-inference:2.4.0-rocm \
--model-id $model
```
......@@ -31,6 +31,12 @@ Two implementations of Flash Attention are available for ROCm, the first is [ROC
By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, the FA Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.
## Custom PagedAttention
For better performance on ROCm, a custom Paged Attention kernel is available and is enabled by default. To disable it and fall back to the PagedAttention v2 kernel, set the environment variable `ROCM_USE_CUSTOM_PAGED_ATTN=0`.
The custom kernel supports bf16 and fp16 data types, block size of 16, head size of 128, a maximum context length of 16k, and GQA ratios between 1 and 16. For other configurations, we use the PagedAttention v2 kernel.
## Unsupported features
The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:
......
# Using TGI with Intel GPUs
TGI-optimized models are supported on Intel Data Center GPU [Max1100](https://www.intel.com/content/www/us/en/products/sku/232876/intel-data-center-gpu-max-1100/specifications.html) and [Max1550](https://www.intel.com/content/www/us/en/products/sku/232873/intel-data-center-gpu-max-1550/specifications.html); the recommended usage is through Docker.
On a server powered by Intel GPUs, TGI can be launched with the following command:
```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.0-intel-xpu \
--model-id $model --cuda-graphs 0
```
# Using TGI with Intel CPUs
Intel® Extension for PyTorch (IPEX) also provides further optimizations for Intel CPUs. IPEX provides optimized operations such as flash attention, paged attention, Add + LayerNorm, RoPE, and more.
On a server powered by Intel CPU, TGI can be launched with the following command:
```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu \
--model-id $model --cuda-graphs 0
```
The launched TGI server can then be queried from clients; make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
......@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.1.0 \
ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```
......
......@@ -11,17 +11,25 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.1.0 \
ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```
<Tip>
If you want to serve gated or private models, please refer to
[this guide](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access)
for detailed instructions.
</Tip>
### Supported hardware
TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Intel GPUs](./installation_intel), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
## Consuming TGI
Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
Once TGI is running, you can use the `generate` endpoint or the OpenAI Chat Completion API-compatible [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) by making requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
<inferencesnippet>
<python>
......@@ -88,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:2.1.0 --help
docker run ghcr.io/huggingface/text-generation-inference:2.4.0 --help
```
</Tip>
# Messages API
Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
# HTTP API Reference
#### Table of Contents
- [Making a Request](#making-a-request)
- [Streaming](#streaming)
- [Synchronous](#synchronous)
- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
- [Cloud Providers](#cloud-providers)
- [Text Generation Inference custom API](#text-generation-inference-custom-api)
- [OpenAI Messages API](#openai-messages-api)
- [Making a Request](#making-a-request)
- [Streaming](#streaming)
- [Synchronous](#synchronous)
- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
- [Cloud Providers](#cloud-providers)
- [Amazon SageMaker](#amazon-sagemaker)
The HTTP API is a RESTful API that allows you to interact with the text-generation-inference component. Two endpoints are available:
* Text Generation Inference [custom API](https://huggingface.github.io/text-generation-inference/)
* OpenAI's [Messages API](#openai-messages-api)
## Text Generation Inference custom API
Check the [API documentation](https://huggingface.github.io/text-generation-inference/) for more information on how to interact with the Text Generation Inference API.
## OpenAI Messages API
Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
## Making a Request
You can make a request to TGI's Messages API using `curl`. Here's an example:
......@@ -128,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene
## Amazon SageMaker
To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API.
Amazon SageMaker natively supports the Messages API:
```python
import json
......@@ -148,12 +159,11 @@ except ValueError:
hub = {
'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
'SM_NUM_GPUS': json.dumps(1),
'MESSAGES_API_ENABLED': True
}
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="2.4.0"),
env=hub,
role=role,
)
......