"tests/ray/vscode:/vscode.git/clone" did not exist on "f87b35b227fb1c1309cb9e9fb72bf22939fbfbc9"
Commit 366dfe82 authored by jixx's avatar jixx
Browse files

init

parents
Pipeline #1939 canceled with stages
import json
import requests
import warnings
from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
from text_generation import DEPRECATION_WARNING
from text_generation.types import (
StreamResponse,
Response,
Request,
Parameters,
Grammar,
CompletionRequest,
Completion,
CompletionComplete,
ChatRequest,
ChatCompletionChunk,
ChatComplete,
Message,
Tool,
)
from text_generation.errors import parse_error
# emit deprecation warnings
warnings.simplefilter("always", DeprecationWarning)
class Client:
"""Client to make calls to a text-generation-inference instance
Example:
```python
>>> from text_generation import Client
>>> client = Client("https://api-inference.huggingface.co/models/bigscience/bloomz")
>>> client.generate("Why is the sky blue?").generated_text
' Rayleigh scattering'
>>> result = ""
>>> for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(
self,
base_url: str,
headers: Optional[Dict[str, str]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: int = 10,
):
"""
Args:
base_url (`str`):
text-generation-inference instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
Cookies to include in the requests
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = timeout
def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
):
"""
Given a prompt, generate a response synchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return Completion(**payload)
else:
return self._completion_stream_response(request)
def _completion_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the server-sent event stream
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
):
"""
Given a list of messages, generate a response synchronously
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_prompt (`str`):
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return ChatComplete(**payload)
else:
return self._chat_stream_response(request)
def _chat_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the server-sent event stream
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
def generate(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
best_of (`int`):
Generate best_of sequences and return the one with the highest token logprobs
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
"""
# Validate parameters
parameters = Parameters(
best_of=best_of,
details=True,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
decoder_input_details=decoder_input_details,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
resp = requests.post(
self.base_url,
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return Response(**payload[0])
def generate_stream(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Iterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Iterator[StreamResponse]: stream of generated tokens
"""
# Validate parameters
parameters = Parameters(
best_of=None,
details=True,
decoder_input_details=False,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
resp = requests.post(
self.base_url,
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
if resp.status_code != 200:
raise parse_error(resp.status_code, resp.json())
# Parse ServerSentEvents
for byte_payload in resp.iter_lines():
# Skip line
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
# Event data
if payload.startswith("data:"):
# Decode payload
json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
# Parse payload
try:
response = StreamResponse(**json_payload)
except ValidationError:
# If we failed to parse the payload, then it is an error payload
raise parse_error(resp.status_code, json_payload)
yield response
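Before the asynchronous client, here is a minimal, self-contained usage sketch for the synchronous `Client` defined above. The base URL is a placeholder for any reachable text-generation-inference instance, and the parameter values are purely illustrative.

```python
from text_generation import Client
from text_generation.types import Message

# Placeholder endpoint: point this at any running text-generation-inference server.
client = Client("http://localhost:8080", timeout=30)

# Single-shot generation: returns a Response with the text and token-level details.
response = client.generate(
    "Why is the sky blue?", max_new_tokens=32, do_sample=True, temperature=0.7
)
print(response.generated_text)
print(response.details.finish_reason, response.details.generated_tokens)

# Token streaming: generate_stream yields StreamResponse objects as tokens arrive.
text = ""
for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=32):
    if not chunk.token.special:
        text += chunk.token.text
print(text)

# OpenAI-style chat streaming: chat(..., stream=True) returns an iterator of
# ChatCompletionChunk objects whose deltas carry pieces of the reply.
for chunk in client.chat(
    messages=[Message(role="user", content="Tell me a joke.")],
    max_tokens=64,
    stream=True,
):
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```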
class AsyncClient:
"""Asynchronous Client to make calls to a text-generation-inference instance
Example:
```python
>>> from text_generation import AsyncClient
>>> client = AsyncClient("https://api-inference.huggingface.co/models/bigscience/bloomz")
>>> response = await client.generate("Why is the sky blue?")
>>> response.generated_text
' Rayleigh scattering'
>>> result = ""
>>> async for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(
self,
base_url: str,
headers: Optional[Dict[str, str]] = None,
cookies: Optional[Dict[str, str]] = None,
timeout: int = 10,
):
"""
Args:
base_url (`str`):
text-generation-inference instance base url
headers (`Optional[Dict[str, str]]`):
Additional headers
cookies (`Optional[Dict[str, str]]`):
Cookies to include in the requests
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = ClientTimeout(timeout)
async def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
) -> Union[Completion, AsyncIterator[CompletionComplete]]:
"""
Given a prompt, generate a response asynchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
return await self._completion_single_response(request)
else:
return self._completion_stream_response(request)
async def _completion_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return Completion(**payload)
async def _completion_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
async def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
"""
Given a list of messages, generate a response asynchronously
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_prompt (`str`):
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
return await self._chat_single_response(request)
else:
return self._chat_stream_response(request)
async def _chat_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return ChatComplete(**payload)
async def _chat_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
async def generate(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text asynchronously
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
best_of (`int`):
Generate best_of sequences and return the one with the highest token logprobs
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
decoder_input_details (`bool`):
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
"""
# Validate parameters
parameters = Parameters(
best_of=best_of,
details=True,
decoder_input_details=decoder_input_details,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(self.base_url, json=request.dict()) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return Response(**payload[0])
async def generate_stream(
self,
prompt: str,
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> AsyncIterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens asynchronously
Args:
prompt (`str`):
Input text
do_sample (`bool`):
Activate logits sampling
max_new_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
Random sampling seed
stop_sequences (`List[str]`):
Stop generating tokens if a member of `stop_sequences` is generated
temperature (`float`):
The value used to modulate the logits distribution.
top_k (`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation.
truncate (`int`):
Truncate inputs tokens to the given size
typical_p (`float`):
Typical Decoding mass
See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
watermark (`bool`):
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
AsyncIterator[StreamResponse]: stream of generated tokens
"""
# Validate parameters
parameters = Parameters(
best_of=None,
details=True,
decoder_input_details=False,
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
temperature=temperature,
top_k=top_k,
top_p=top_p,
truncate=truncate,
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(self.base_url, json=request.dict()) as resp:
if resp.status != 200:
raise parse_error(resp.status, await resp.json())
# Parse ServerSentEvents
async for byte_payload in resp.content:
# Skip line
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
# Event data
if payload.startswith("data:"):
# Decode payload
json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
# Parse payload
try:
response = StreamResponse(**json_payload)
except ValidationError:
# If we failed to parse the payload, then it is an error payload
raise parse_error(resp.status, json_payload)
yield response
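The asynchronous client mirrors the synchronous one; a short sketch, again assuming a placeholder server URL:

```python
import asyncio

from text_generation import AsyncClient


async def main() -> None:
    # Placeholder endpoint for a running text-generation-inference server.
    client = AsyncClient("http://localhost:8080", timeout=60)

    # Awaitable single response.
    response = await client.generate("Why is the sky blue?", max_new_tokens=32)
    print(response.generated_text)

    # Asynchronous token stream.
    text = ""
    async for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=32):
        if not chunk.token.special:
            text += chunk.token.text
    print(text)


asyncio.run(main())
```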
from typing import Dict
# Text Generation Inference Errors
class ValidationError(Exception):
def __init__(self, message: str):
super().__init__(message)
class GenerationError(Exception):
def __init__(self, message: str):
super().__init__(message)
class OverloadedError(Exception):
def __init__(self, message: str):
super().__init__(message)
class IncompleteGenerationError(Exception):
def __init__(self, message: str):
super().__init__(message)
# API Inference Errors
class BadRequestError(Exception):
def __init__(self, message: str):
super().__init__(message)
class ShardNotReadyError(Exception):
def __init__(self, message: str):
super().__init__(message)
class ShardTimeoutError(Exception):
def __init__(self, message: str):
super().__init__(message)
class NotFoundError(Exception):
def __init__(self, message: str):
super().__init__(message)
class RateLimitExceededError(Exception):
def __init__(self, message: str):
super().__init__(message)
class NotSupportedError(Exception):
def __init__(self, model_id: str):
message = (
f"Model `{model_id}` is not available for inference with this client. \n"
"Use `huggingface_hub.inference_api.InferenceApi` instead."
)
super(NotSupportedError, self).__init__(message)
# Unknown error
class UnknownError(Exception):
def __init__(self, message: str):
super().__init__(message)
def parse_error(status_code: int, payload: Dict[str, str]) -> Exception:
"""
Parse error given an HTTP status code and a json payload
Args:
status_code (`int`):
HTTP status code
payload (`Dict[str, str]`):
Json payload
Returns:
Exception: parsed exception
"""
# Try to parse a Text Generation Inference error
message = payload["error"]
if "error_type" in payload:
error_type = payload["error_type"]
if error_type == "generation":
return GenerationError(message)
if error_type == "incomplete_generation":
return IncompleteGenerationError(message)
if error_type == "overloaded":
return OverloadedError(message)
if error_type == "validation":
return ValidationError(message)
# Try to parse a APIInference error
if status_code == 400:
return BadRequestError(message)
if status_code == 403 or status_code == 424:
return ShardNotReadyError(message)
if status_code == 504:
return ShardTimeoutError(message)
if status_code == 404:
return NotFoundError(message)
if status_code == 429:
return RateLimitExceededError(message)
# Fallback to an unknown error
return UnknownError(message)
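A small sketch of how `parse_error` dispatches: the payload's `error_type` takes precedence, then the HTTP status code, then the unknown-error fallback. The payloads below are illustrative.

```python
from text_generation.errors import (
    GenerationError,
    RateLimitExceededError,
    UnknownError,
    parse_error,
)

# `error_type` in the payload takes precedence over the HTTP status code.
err = parse_error(
    500, {"error": "Request failed during generation", "error_type": "generation"}
)
assert isinstance(err, GenerationError)

# Without an `error_type`, the status code selects the exception class.
assert isinstance(parse_error(429, {"error": "Model is overloaded"}), RateLimitExceededError)

# Anything else falls back to UnknownError.
assert isinstance(parse_error(418, {"error": "unexpected"}), UnknownError)
```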
import os
import requests
from typing import Dict, Optional, List
from huggingface_hub.utils import build_hf_headers
from text_generation import Client, AsyncClient, __version__
from text_generation.types import DeployedModel
from text_generation.errors import NotSupportedError, parse_error
INFERENCE_ENDPOINT = os.environ.get(
"HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co"
)
def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
"""
Get all currently deployed models with text-generation-inference support
Returns:
List[DeployedModel]: list of all currently deployed models
"""
resp = requests.get(
f"https://api-inference.huggingface.co/framework/text-generation-inference",
headers=headers,
timeout=5,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
models = [DeployedModel(**raw_deployed_model) for raw_deployed_model in payload]
return models
def check_model_support(repo_id: str, headers: Optional[Dict] = None) -> bool:
"""
Check if a given model is supported by text-generation-inference
Returns:
bool: whether the model is supported by this client
"""
resp = requests.get(
f"https://api-inference.huggingface.co/status/{repo_id}",
headers=headers,
timeout=5,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
framework = payload["framework"]
supported = framework == "text-generation-inference"
return supported
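A quick sketch of the two helpers above, assuming this module is importable as `text_generation.inference_api` and the (legacy) serverless Inference API is reachable:

```python
from text_generation.inference_api import check_model_support, deployed_models

# List models currently served with text-generation-inference support.
for model in deployed_models():
    print(model.model_id, model.sha)

# Check a single repository before building a client for it.
if check_model_support("bigscience/bloomz"):
    print("bigscience/bloomz is served with text-generation-inference")
```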
class InferenceAPIClient(Client):
"""Client to make calls to the HuggingFace Inference API.
Only supports a subset of the available text-generation or text2text-generation models that are served using
text-generation-inference
Example:
```python
>>> from text_generation import InferenceAPIClient
>>> client = InferenceAPIClient("bigscience/bloomz")
>>> client.generate("Why is the sky blue?").generated_text
' Rayleigh scattering'
>>> result = ""
>>> for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
"""
Init headers and API information
Args:
repo_id (`str`):
Id of repository (e.g. `bigscience/bloom`).
token (`str`, `optional`):
The API token to use as HTTP bearer authorization. This is not
the authentication token. You can find the token in
https://huggingface.co/settings/token. Alternatively, you can
find both your organizations and personal API tokens using
`HfApi().whoami(token)`.
timeout (`int`):
Timeout in seconds
"""
headers = build_hf_headers(
token=token, library_name="text-generation", library_version=__version__
)
# Text Generation Inference client only supports a subset of the available hub models
if not check_model_support(repo_id, headers):
raise NotSupportedError(repo_id)
base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"
super(InferenceAPIClient, self).__init__(
base_url, headers=headers, timeout=timeout
)
class InferenceAPIAsyncClient(AsyncClient):
"""Aynschronous Client to make calls to the HuggingFace Inference API.
Only supports a subset of the available text-generation or text2text-generation models that are served using
text-generation-inference
Example:
```python
>>> from text_generation import InferenceAPIAsyncClient
>>> client = InferenceAPIAsyncClient("bigscience/bloomz")
>>> response = await client.generate("Why is the sky blue?")
>>> response.generated_text
' Rayleigh scattering'
>>> result = ""
>>> async for response in client.generate_stream("Why is the sky blue?"):
>>> if not response.token.special:
>>> result += response.token.text
>>> result
' Rayleigh scattering'
```
"""
def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
"""
Init headers and API information
Args:
repo_id (`str`):
Id of repository (e.g. `bigscience/bloom`).
token (`str`, `optional`):
The API token to use as HTTP bearer authorization. This is not
the authentication token. You can find the token in
https://huggingface.co/settings/token. Alternatively, you can
find both your organizations and personal API tokens using
`HfApi().whoami(token)`.
timeout (`int`):
Timeout in seconds
"""
headers = build_hf_headers(
token=token, library_name="text-generation", library_version=__version__
)
# Text Generation Inference client only supports a subset of the available hub models
if not check_model_support(repo_id, headers):
raise NotSupportedError(repo_id)
base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"
super(InferenceAPIAsyncClient, self).__init__(
base_url, headers=headers, timeout=timeout
)
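A short sketch of the Inference API wrappers above; note that these clients (and the serverless endpoint they target) are deprecated, so treat the call as illustrative only.

```python
from text_generation import InferenceAPIClient

# Token is optional for public models; pass one for gated or rate-limited access.
client = InferenceAPIClient("bigscience/bloomz", token=None, timeout=30)

print(client.generate("Why is the sky blue?", max_new_tokens=20).generated_text)

text = ""
for chunk in client.generate_stream("Why is the sky blue?", max_new_tokens=20):
    if not chunk.token.special:
        text += chunk.token.text
print(text)
```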
from enum import Enum
from pydantic import BaseModel, field_validator, ConfigDict
from typing import Optional, List, Union, Any
from text_generation.errors import ValidationError
# enum for grammar type
class GrammarType(str, Enum):
Json = "json"
Regex = "regex"
# Grammar type and value
class Grammar(BaseModel):
# Grammar type
type: GrammarType
# Grammar value
value: Union[str, dict]
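A brief sketch of building `Grammar` values; the JSON schema is a made-up example, and either grammar can be passed to `Client.generate(..., grammar=...)`.

```python
from text_generation.types import Grammar, GrammarType

# Constrain generation to a JSON object matching an illustrative schema.
json_grammar = Grammar(
    type=GrammarType.Json,
    value={
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    },
)

# Or constrain generation to a regular expression.
regex_grammar = Grammar(type=GrammarType.Regex, value=r"\d{4}-\d{2}-\d{2}")
```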
class ToolCall(BaseModel):
# Id of the tool call
id: int
# Type of the tool call
type: str
# Function details of the tool call
function: dict
class Message(BaseModel):
# Role of the message sender
role: str
# Content of the message
content: Optional[str] = None
# Optional name of the message sender
name: Optional[str] = None
# Tool calls associated with the chat completion
tool_calls: Optional[Any] = None
class Tool(BaseModel):
# Type of the tool
type: str
# Function details of the tool
function: dict
class Function(BaseModel):
name: Optional[str]
arguments: str
class ChoiceDeltaToolCall(BaseModel):
index: int
id: str
type: str
function: Function
class ChoiceDelta(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[ChoiceDeltaToolCall]
class Choice(BaseModel):
index: int
delta: ChoiceDelta
logprobs: Optional[dict] = None
finish_reason: Optional[str] = None
class CompletionRequest(BaseModel):
# Model identifier
model: str
# Prompt
prompt: str
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class CompletionComplete(BaseModel):
# Index of the completion choice
index: int
# Text of the completion choice
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Completion(BaseModel):
# Completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[CompletionComplete]
class ChatRequest(BaseModel):
# Model identifier
model: str
# List of messages in the conversation
messages: List[Message]
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Bias values for token selection
logit_bias: Optional[List[float]] = None
# Whether to return log probabilities
logprobs: Optional[bool] = None
# Number of most likely tokens to return at each position
top_logprobs: Optional[int] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Number of chat completion choices to generate
n: Optional[int] = None
# Penalty for presence of new tokens
presence_penalty: Optional[float] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# List of tools to be used
tools: Optional[List[Tool]] = None
# A prompt to be appended before the tools
tool_prompt: Optional[str] = None
# Choice of tool to be used
tool_choice: Optional[str] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[Choice]
class Parameters(BaseModel):
# Activate logits sampling
do_sample: bool = False
# Maximum number of generated tokens
max_new_tokens: int = 20
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Whether to prepend the prompt to the generated text
return_full_text: bool = False
# Stop generating tokens if a member of `stop_sequences` is generated
stop: List[str] = []
# Random sampling seed
seed: Optional[int] = None
# The value used to modulate the logits distribution.
temperature: Optional[float] = None
# The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_k: Optional[int] = None
# If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
# higher are kept for generation.
top_p: Optional[float] = None
# truncate inputs tokens to the given size
truncate: Optional[int] = None
# Typical Decoding mass
# See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
typical_p: Optional[float] = None
# Generate best_of sequences and return the one with the highest token logprobs
best_of: Optional[int] = None
# Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
watermark: bool = False
# Get generation details
details: bool = False
# Get decoder input token logprobs and ids
decoder_input_details: bool = False
# Return the N most likely tokens at each step
top_n_tokens: Optional[int] = None
# grammar to use for generation
grammar: Optional[Grammar] = None
@field_validator("best_of")
def valid_best_of(cls, field_value, values):
if field_value is not None:
if field_value <= 0:
raise ValidationError("`best_of` must be strictly positive")
if field_value > 1 and values.data["seed"] is not None:
raise ValidationError("`seed` must not be set when `best_of` is > 1")
sampling = (
values.data["do_sample"]
| (values.data["temperature"] is not None)
| (values.data["top_k"] is not None)
| (values.data["top_p"] is not None)
| (values.data["typical_p"] is not None)
)
if field_value > 1 and not sampling:
raise ValidationError("you must use sampling when `best_of` is > 1")
return field_value
@field_validator("repetition_penalty")
def valid_repetition_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`repetition_penalty` must be strictly positive")
return v
@field_validator("frequency_penalty")
def valid_frequency_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`frequency_penalty` must be strictly positive")
return v
@field_validator("seed")
def valid_seed(cls, v):
if v is not None and v < 0:
raise ValidationError("`seed` must be positive")
return v
@field_validator("temperature")
def valid_temp(cls, v):
if v is not None and v <= 0:
raise ValidationError("`temperature` must be strictly positive")
return v
@field_validator("top_k")
def valid_top_k(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_k` must be strictly positive")
return v
@field_validator("top_p")
def valid_top_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`top_p` must be > 0.0 and < 1.0")
return v
@field_validator("truncate")
def valid_truncate(cls, v):
if v is not None and v <= 0:
raise ValidationError("`truncate` must be strictly positive")
return v
@field_validator("typical_p")
def valid_typical_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
return v
@field_validator("top_n_tokens")
def valid_top_n_tokens(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_n_tokens` must be strictly positive")
return v
@field_validator("grammar")
def valid_grammar(cls, v):
if v is not None:
if v.type == GrammarType.Regex and not v.value:
raise ValidationError("`value` cannot be empty for `regex` grammar")
if v.type == GrammarType.Json and not v.value:
raise ValidationError("`value` cannot be empty for `json` grammar")
return v
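A short sketch of how the validators above behave; invalid combinations raise the library's own `ValidationError` at construction time.

```python
from text_generation.errors import ValidationError
from text_generation.types import Parameters

# Sampling knobs are checked eagerly when the model is constructed.
params = Parameters(do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=16)
print(params.max_new_tokens)

# `best_of > 1` requires sampling (and forbids a fixed seed).
try:
    Parameters(best_of=2, do_sample=False)
except ValidationError as exc:
    print(exc)  # you must use sampling when `best_of` is > 1
```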
class Request(BaseModel):
# Prompt
inputs: str
# Generation parameters
parameters: Optional[Parameters] = None
# Whether to stream output tokens
stream: bool = False
@field_validator("inputs")
def valid_input(cls, v):
if not v:
raise ValidationError("`inputs` cannot be empty")
return v
@field_validator("stream")
def valid_best_of_stream(cls, field_value, values):
parameters = values.data["parameters"]
if (
parameters is not None
and parameters.best_of is not None
and parameters.best_of > 1
and field_value
):
raise ValidationError(
"`best_of` != 1 is not supported when `stream` == True"
)
return field_value
# Decoder input tokens
class InputToken(BaseModel):
# Token ID from the model tokenizer
id: int
# Token text
text: str
# Logprob
# Optional since the logprob of the first token cannot be computed
logprob: Optional[float] = None
# Generated tokens
class Token(BaseModel):
# Token ID from the model tokenizer
id: int
# Token text
text: str
# Logprob
logprob: Optional[float] = None
# Is the token a special token
# Can be used to ignore tokens when concatenating
special: bool
# Generation finish reason
class FinishReason(str, Enum):
# number of generated tokens == `max_new_tokens`
Length = "length"
# the model generated its end of sequence token
EndOfSequenceToken = "eos_token"
# the model generated a text included in `stop_sequences`
StopSequence = "stop_sequence"
# Additional sequences when using the `best_of` parameter
class BestOfSequence(BaseModel):
# Generated text
generated_text: str
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# Decoder input tokens, empty if decoder_input_details is False
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]] = None
# `generate` details
class Details(BaseModel):
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# Decoder input tokens, empty if decoder_input_details is False
prefill: List[InputToken]
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]] = None
# Additional sequences when using the `best_of` parameter
best_of_sequences: Optional[List[BestOfSequence]] = None
# `generate` return value
class Response(BaseModel):
# Generated text
generated_text: str
# Generation details
details: Details
# `generate_stream` details
class StreamDetails(BaseModel):
# Generation finish reason
finish_reason: FinishReason
# Number of generated tokens
generated_tokens: int
# Sampling seed if sampling was activated
seed: Optional[int] = None
# `generate_stream` return value
class StreamResponse(BaseModel):
# Generated token
token: Token
# Most likely tokens
top_tokens: Optional[List[Token]] = None
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str] = None
# Generation details
# Only available when the generation is finished
details: Optional[StreamDetails] = None
# Inference API currently deployed model
class DeployedModel(BaseModel):
# Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
# with model_ prefixes, since this disables guardrails for colliding fields:
# https://github.com/pydantic/pydantic/issues/9177
model_config = ConfigDict(protected_namespaces=())
model_id: str
sha: str
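To tie the request/response models together, a sketch that builds the payload `Client.generate` posts and parses an illustrative (trimmed) server reply:

```python
import json

from text_generation.types import Parameters, Request, Response

# Build the JSON body that Client.generate posts to the server root.
request = Request(
    inputs="Why is the sky blue?",
    stream=False,
    parameters=Parameters(max_new_tokens=16, details=True),
)
print(json.dumps(request.dict(), indent=2))

# Parse an illustrative server reply back into a Response model.
raw = {
    "generated_text": " Rayleigh scattering",
    "details": {
        "finish_reason": "length",
        "generated_tokens": 16,
        "prefill": [],
        "tokens": [],
    },
}
print(Response(**raw).generated_text)
```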
Documentation available at: https://huggingface.co/docs/text-generation-inference
## Release
When making a release, please update the latest version in the documentation with:
```
export OLD_VERSION="2\.0\.3"
export NEW_VERSION="2\.0\.4"
find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
```
<html>
<head>
<!-- Load the latest Swagger UI code and style from npm using unpkg.com -->
<script src="https://unpkg.com/swagger-ui-dist@3/swagger-ui-bundle.js"></script>
<link rel="stylesheet" type="text/css" href="https://unpkg.com/swagger-ui-dist@3/swagger-ui.css"/>
<title>Text Generation Inference API</title>
</head>
<body>
<div id="swagger-ui"></div> <!-- Div to hold the UI component -->
<script>
window.onload = function () {
// Begin Swagger UI call region
const ui = SwaggerUIBundle({
url: "openapi.json", //Location of Open API spec in the repo
dom_id: '#swagger-ui',
deepLinking: true,
supportedSubmitMethods: [],
presets: [
SwaggerUIBundle.presets.apis,
SwaggerUIBundle.SwaggerUIStandalonePreset
],
plugins: [
SwaggerUIBundle.plugins.DownloadUrl
],
})
window.ui = ui
}
</script>
</body>
</html>
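The OpenAPI document below describes the raw HTTP endpoints; as a sketch, the `/generate` and `/generate_stream` routes can be called directly with `requests` (the base URL is a placeholder for a running server).

```python
import requests

BASE_URL = "http://localhost:8080"  # placeholder text-generation-inference server

# POST /generate returns a single generated text with optional details.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={
        "inputs": "Why is the sky blue?",
        "parameters": {"max_new_tokens": 16, "details": True},
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["generated_text"])

# POST /generate_stream returns Server-Sent Events, one `data:` line per token.
with requests.post(
    f"{BASE_URL}/generate_stream",
    json={"inputs": "Why is the sky blue?", "parameters": {"max_new_tokens": 16}},
    stream=True,
    timeout=30,
) as stream:
    for line in stream.iter_lines():
        if line.startswith(b"data:"):
            print(line.decode("utf-8"))
```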
{
"openapi": "3.0.3",
"info": {
"title": "Text Generation Inference",
"description": "Text Generation Webserver",
"contact": {
"name": "Olivier Dehaene"
},
"license": {
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.1.1"
},
"paths": {
"/": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"operationId": "compat_generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompatGenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate_stream": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate a stream of token using Server-Sent Events",
"operationId": "generate_stream",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/health": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Health check method",
"operationId": "health",
"responses": {
"200": {
"description": "Everything is working fine"
},
"503": {
"description": "Text generation inference is down",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "unhealthy",
"error_type": "healthcheck"
}
}
}
}
}
}
},
"/info": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Text Generation Inference endpoint info",
"operationId": "get_model_info",
"responses": {
"200": {
"description": "Served model info",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Info"
}
}
}
}
}
}
},
"/metrics": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Prometheus metrics scrape endpoint",
"operationId": "metrics",
"responses": {
"200": {
"description": "Prometheus Metrics",
"content": {
"text/plain": {
"schema": {
"type": "string"
}
}
}
}
}
}
},
"/tokenize": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Tokenize inputs",
"operationId": "tokenize",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Tokenized ids",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TokenizeResponse"
}
}
}
},
"404": {
"description": "No tokenizer found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "No fast tokenizer available"
}
}
}
}
}
}
},
"/v1/chat/completions": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "chat_completions",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Chat Completion",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatCompletion"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ChatCompletionChunk"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/v1/completions": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"operationId": "completions",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Chat Completion",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Completion"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionCompleteChunk"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
}
},
"components": {
"schemas": {
"BestOfSequence": {
"type": "object",
"required": [
"generated_text",
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_text": {
"type": "string",
"example": "test"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ChatCompletion": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
"ChatCompletionChoice": {
"type": "object",
"required": [
"index",
"delta"
],
"properties": {
"delta": {
"$ref": "#/components/schemas/ChatCompletionDelta"
},
"finish_reason": {
"type": "string",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"allOf": [
{
"$ref": "#/components/schemas/ChatCompletionLogprobs"
}
],
"nullable": true
}
}
},
"ChatCompletionChunk": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionChoice"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270978",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
}
}
},
"ChatCompletionComplete": {
"type": "object",
"required": [
"index",
"message",
"finish_reason"
],
"properties": {
"finish_reason": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"allOf": [
{
"$ref": "#/components/schemas/ChatCompletionLogprobs"
}
],
"nullable": true
},
"message": {
"$ref": "#/components/schemas/OutputMessage"
}
}
},
"ChatCompletionDelta": {
"oneOf": [
{
"$ref": "#/components/schemas/TextMessage"
},
{
"$ref": "#/components/schemas/ToolCallDelta"
}
]
},
"ChatCompletionLogprob": {
"type": "object",
"required": [
"token",
"logprob",
"top_logprobs"
],
"properties": {
"logprob": {
"type": "number",
"format": "float"
},
"token": {
"type": "string"
},
"top_logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionTopLogprob"
}
}
}
},
"ChatCompletionLogprobs": {
"type": "object",
"required": [
"content"
],
"properties": {
"content": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionLogprob"
}
}
}
},
"ChatCompletionTopLogprob": {
"type": "object",
"required": [
"token",
"logprob"
],
"properties": {
"logprob": {
"type": "number",
"format": "float"
},
"token": {
"type": "string"
}
}
},
"ChatRequest": {
"type": "object",
"required": [
"model",
"messages"
],
"properties": {
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"example": "1.0",
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {
"type": "number",
"format": "float"
},
"description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
"nullable": true
},
"logprobs": {
"type": "boolean",
"description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
"example": "false",
"nullable": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"example": "32",
"nullable": true,
"minimum": 0
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
},
"description": "A list of messages comprising the conversation so far.",
"example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
},
"model": {
"type": "string",
"description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"n": {
"type": "integer",
"format": "int32",
"description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
"example": "2",
"nullable": true,
"minimum": 0
},
"presence_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
"example": 0.1,
"nullable": true
},
"response_format": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"default": "null",
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens.",
"example": "null",
"nullable": true
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number",
"format": "float",
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
"example": 1.0,
"nullable": true
},
"tool_choice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"tool_prompt": {
"type": "string",
"description": "A prompt to be appended before the tools",
"example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
"nullable": true
},
"tools": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Tool"
},
"description": "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for.",
"example": "null",
"nullable": true
},
"top_logprobs": {
"type": "integer",
"format": "int32",
"description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
"example": "5",
"nullable": true,
"minimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
"example": 0.95,
"nullable": true
}
}
},
"Chunk": {
"type": "object",
"required": [
"id",
"created",
"choices",
"model",
"system_fingerprint"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
}
}
},
"CompatGenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
},
"stream": {
"type": "boolean",
"default": "false"
}
}
},
"Completion": {
"oneOf": [
{
"allOf": [
{
"$ref": "#/components/schemas/Chunk"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
},
{
"allOf": [
{
"$ref": "#/components/schemas/CompletionFinal"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
}
],
"discriminator": {
"propertyName": "object"
}
},
"CompletionComplete": {
"type": "object",
"required": [
"index",
"text",
"finish_reason"
],
"properties": {
"finish_reason": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"type": "array",
"items": {
"type": "number",
"format": "float"
},
"nullable": true
},
"text": {
"type": "string"
}
}
},
"CompletionFinal": {
"type": "object",
"required": [
"id",
"created",
"model",
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
"CompletionRequest": {
"type": "object",
"required": [
"model",
"prompt"
],
"properties": {
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"example": "1.0",
"nullable": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "32",
"nullable": true,
"minimum": 0
},
"model": {
"type": "string",
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"prompt": {
"$ref": "#/components/schemas/Prompt"
},
"repetition_penalty": {
"type": "number",
"format": "float",
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens.",
"example": "null",
"nullable": true
},
"stream": {
"type": "boolean"
},
"suffix": {
"type": "string",
"description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template.",
"nullable": true
},
"temperature": {
"type": "number",
"format": "float",
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.",
"example": 1.0,
"nullable": true
},
"top_p": {
"type": "number",
"format": "float",
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
"example": 0.95,
"nullable": true
}
}
},
"DeltaToolCall": {
"type": "object",
"required": [
"index",
"id",
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/Function"
},
"id": {
"type": "string"
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"type": {
"type": "string"
}
}
},
"Details": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"best_of_sequences": {
"type": "array",
"items": {
"$ref": "#/components/schemas/BestOfSequence"
},
"nullable": true
},
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ErrorResponse": {
"type": "object",
"required": [
"error",
"error_type"
],
"properties": {
"error": {
"type": "string"
},
"error_type": {
"type": "string"
}
}
},
"FinishReason": {
"type": "string",
"enum": [
"length",
"eos_token",
"stop_sequence"
],
"example": "Length"
},
"Function": {
"type": "object",
"required": [
"arguments"
],
"properties": {
"arguments": {
"type": "string"
},
"name": {
"type": "string",
"nullable": true
}
}
},
"FunctionDefinition": {
"type": "object",
"required": [
"name",
"arguments"
],
"properties": {
"arguments": {},
"description": {
"type": "string",
"nullable": true
},
"name": {
"type": "string"
}
}
},
"GenerateParameters": {
"type": "object",
"properties": {
"adapter_id": {
"type": "string",
"description": "Lora adapter id",
"default": "null",
"example": "null",
"nullable": true
},
"best_of": {
"type": "integer",
"description": "Generate best_of sequences and return the one if the highest token logprobs.",
"default": "null",
"example": 1,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"decoder_input_details": {
"type": "boolean",
"description": "Whether to return decoder input token logprobs and ids.",
"default": "false"
},
"details": {
"type": "boolean",
"description": "Whether to return generation details.",
"default": "true"
},
"do_sample": {
"type": "boolean",
"description": "Activate logits sampling.",
"default": "false",
"example": true
},
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"default": "null",
"example": 0.1,
"nullable": true,
"exclusiveMinimum": -2
},
"grammar": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"default": "null",
"nullable": true
},
"max_new_tokens": {
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"example": "20",
"nullable": true,
"minimum": 0
},
"repetition_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.",
"default": "null",
"example": 1.03,
"nullable": true,
"exclusiveMinimum": 0
},
"return_full_text": {
"type": "boolean",
"description": "Whether to prepend the prompt to the generated text",
"default": "null",
"example": false,
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"description": "Random sampling seed.",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Stop generating tokens if a member of `stop` is generated.",
"example": [
"photographer"
],
"maxItems": 4
},
"temperature": {
"type": "number",
"format": "float",
"description": "The value used to module the logits distribution.",
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0
},
"top_k": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.",
"default": "null",
"example": 10,
"nullable": true,
"exclusiveMinimum": 0
},
"top_n_tokens": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.",
"default": "null",
"example": 5,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"description": "Top-p value for nucleus sampling.",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"truncate": {
"type": "integer",
"description": "Truncate inputs tokens to the given size.",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0
},
"typical_p": {
"type": "number",
"format": "float",
"description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"watermark": {
"type": "boolean",
"description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).",
"default": "false",
"example": true
}
}
},
"GenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
}
}
},
"GenerateResponse": {
"type": "object",
"required": [
"generated_text"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/Details"
}
],
"nullable": true
},
"generated_text": {
"type": "string",
"example": "test"
}
}
},
"GrammarType": {
"oneOf": [
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"json"
]
},
"value": {
"description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
}
}
},
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"regex"
]
},
"value": {
"type": "string"
}
}
}
],
"discriminator": {
"propertyName": "type"
}
},
"Info": {
"type": "object",
"required": [
"model_id",
"model_dtype",
"model_device_type",
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_tokens",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"max_client_batch_size",
"router",
"version"
],
"properties": {
"docker_label": {
"type": "string",
"example": "null",
"nullable": true
},
"max_batch_size": {
"type": "integer",
"example": "null",
"nullable": true,
"minimum": 0
},
"max_batch_total_tokens": {
"type": "integer",
"format": "int32",
"example": "32000",
"minimum": 0
},
"max_best_of": {
"type": "integer",
"example": "2",
"minimum": 0
},
"max_client_batch_size": {
"type": "integer",
"example": "32",
"minimum": 0
},
"max_concurrent_requests": {
"type": "integer",
"description": "Router Parameters",
"example": "128",
"minimum": 0
},
"max_input_tokens": {
"type": "integer",
"example": "1024",
"minimum": 0
},
"max_stop_sequences": {
"type": "integer",
"example": "4",
"minimum": 0
},
"max_total_tokens": {
"type": "integer",
"example": "2048",
"minimum": 0
},
"max_waiting_tokens": {
"type": "integer",
"example": "20",
"minimum": 0
},
"model_device_type": {
"type": "string",
"example": "cuda"
},
"model_dtype": {
"type": "string",
"example": "torch.float16"
},
"model_id": {
"type": "string",
"description": "Model info",
"example": "bigscience/blomm-560m"
},
"model_pipeline_tag": {
"type": "string",
"example": "text-generation",
"nullable": true
},
"model_sha": {
"type": "string",
"example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
"nullable": true
},
"router": {
"type": "string",
"description": "Router Info",
"example": "text-generation-router"
},
"sha": {
"type": "string",
"example": "null",
"nullable": true
},
"validation_workers": {
"type": "integer",
"example": "2",
"minimum": 0
},
"version": {
"type": "string",
"example": "0.5.0"
},
"waiting_served_ratio": {
"type": "number",
"format": "float",
"example": "1.2"
}
}
},
"Message": {
"type": "object",
"required": [
"role",
"content"
],
"properties": {
"content": {
"$ref": "#/components/schemas/MessageContent"
},
"name": {
"type": "string",
"example": "\"David\"",
"nullable": true
},
"role": {
"type": "string",
"example": "user"
}
}
},
"PrefillToken": {
"type": "object",
"required": [
"id",
"text",
"logprob"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"text": {
"type": "string",
"example": "test"
}
}
},
"Prompt": {
"type": "array",
"items": {
"type": "string"
}
},
"SimpleToken": {
"type": "object",
"required": [
"id",
"text",
"start",
"stop"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"start": {
"type": "integer",
"example": 0,
"minimum": 0
},
"stop": {
"type": "integer",
"example": 2,
"minimum": 0
},
"text": {
"type": "string",
"example": "test"
}
}
},
"StreamDetails": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
}
}
},
"StreamResponse": {
"type": "object",
"required": [
"index",
"token"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/StreamDetails"
}
],
"default": "null",
"nullable": true
},
"generated_text": {
"type": "string",
"default": "null",
"example": "test",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"token": {
"$ref": "#/components/schemas/Token"
},
"top_tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
},
"Token": {
"type": "object",
"required": [
"id",
"text",
"logprob",
"special"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"special": {
"type": "boolean",
"example": "false"
},
"text": {
"type": "string",
"example": "test"
}
}
},
"TokenizeResponse": {
"type": "array",
"items": {
"$ref": "#/components/schemas/SimpleToken"
}
},
"Tool": {
"type": "object",
"required": [
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionDefinition"
},
"type": {
"type": "string",
"example": "function"
}
}
},
"ToolCall": {
"type": "object",
"required": [
"id",
"type",
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionDefinition"
},
"id": {
"type": "string"
},
"type": {
"type": "string"
}
}
},
"ToolType": {
"oneOf": [
{
"type": "object",
"default": null,
"nullable": true
},
{
"type": "string"
},
{
"type": "object",
"required": [
"function"
],
"properties": {
"function": {
"$ref": "#/components/schemas/FunctionName"
}
}
}
]
},
"Usage": {
"type": "object",
"required": [
"prompt_tokens",
"completion_tokens",
"total_tokens"
],
"properties": {
"completion_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"prompt_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"total_tokens": {
"type": "integer",
"format": "int32",
"minimum": 0
}
}
}
}
},
"tags": [
{
"name": "Text Generation Inference",
"description": "Hugging Face Text Generation Inference API"
}
]
}
- sections:
- local: index
title: Text Generation Inference
- local: quicktour
title: Quick Tour
- local: installation_nvidia
title: Using TGI with Nvidia GPUs
- local: installation_amd
title: Using TGI with AMD GPUs
- local: installation_gaudi
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
title: Consuming TGI
- local: basic_tutorials/preparing_model
title: Preparing Model for Serving
- local: basic_tutorials/gated_model_access
title: Serving Private & Gated Models
- local: basic_tutorials/using_cli
title: Using TGI CLI
- local: basic_tutorials/launcher
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
title: Safety
- local: basic_tutorials/using_guidance
title: Using Guidance, JSON, tools
- local: basic_tutorials/visual_language_models
title: Visual Language Models
- local: basic_tutorials/monitoring
title: Monitoring TGI with Prometheus and Grafana
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: conceptual/streaming
title: Streaming
- local: conceptual/quantization
title: Quantization
- local: conceptual/tensor_parallelism
title: Tensor Parallelism
- local: conceptual/paged_attention
title: PagedAttention
- local: conceptual/safetensors
title: Safetensors
- local: conceptual/flash_attention
title: Flash Attention
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)
title: Conceptual Guides
# Text Generation Inference Architecture
This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.
A high-level architecture diagram can be seen here:
![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
This diagram shows clearly that there are these separate components:
- **The router**, also named `webserver`, receives the client requests, buffers them, creates batches, and prepares gRPC calls to a model server.
- **The model server** is responsible for receiving the gRPC requests and running inference on the model. If the model is sharded across multiple accelerators (e.g. multiple GPUs), the model server shards might be synchronized via NCCL or an equivalent library.
- **The launcher** is a helper that launches one or several model servers (if the model is sharded), and then launches the router with compatible arguments.
The router and the model server can run on two different machines; they do not need to be deployed together.
## The Router
This component is a rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
The router receives the API calls and handles the batching logic (an introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
It uses different strategies to reduce latency between requests and responses, especially around decoding latency. It relies on queues, schedulers, and block allocators to achieve that and to produce batched requests that it then sends to the model server.
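As a quick illustration, here is a minimal sketch of calling the OpenAI-compatible `/v1/chat/completions` route exposed by the router. It assumes a TGI instance listening on `http://127.0.0.1:8080` (adjust host and port to your deployment); the payload fields follow the `ChatRequest` schema of the HTTP API.
```python
import requests

# Hypothetical local TGI endpoint; adjust host/port to your deployment.
BASE_URL = "http://127.0.0.1:8080"

payload = {
    "model": "tgi",  # the model id is unused by TGI, any string works
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 32,
}

# POST to the OpenAI-compatible chat completions route served by the router.
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"])
```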
### Router's command line
The router command line is the way to pass parameters to it (it does not rely on a configuration file):
```
Text Generation Webserver
Usage: text-generation-router [OPTIONS]
Options:
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
[env: MAX_CONCURRENT_REQUESTS=] [default: 128]
--max-best-of <MAX_BEST_OF>
[env: MAX_BEST_OF=] [default: 2]
--max-stop-sequences <MAX_STOP_SEQUENCES>
[env: MAX_STOP_SEQUENCES=] [default: 4]
--max-top-n-tokens <MAX_TOP_N_TOKENS>
[env: MAX_TOP_N_TOKENS=] [default: 5]
--max-input-tokens <MAX_INPUT_TOKENS>
[env: MAX_INPUT_TOKENS=] [default: 1024]
--max-total-tokens <MAX_TOTAL_TOKENS>
[env: MAX_TOTAL_TOKENS=] [default: 2048]
--waiting-served-ratio <WAITING_SERVED_RATIO>
[env: WAITING_SERVED_RATIO=] [default: 1.2]
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
[env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
--max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
[env: MAX_BATCH_TOTAL_TOKENS=]
--max-waiting-tokens <MAX_WAITING_TOKENS>
[env: MAX_WAITING_TOKENS=] [default: 20]
--max-batch-size <MAX_BATCH_SIZE>
[env: MAX_BATCH_SIZE=]
--hostname <HOSTNAME>
[env: HOSTNAME=] [default: 0.0.0.0]
-p, --port <PORT>
[env: PORT=] [default: 3000]
--master-shard-uds-path <MASTER_SHARD_UDS_PATH>
[env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
--tokenizer-name <TOKENIZER_NAME>
[env: TOKENIZER_NAME=] [default: bigscience/bloom]
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
[env: TOKENIZER_CONFIG_PATH=]
--revision <REVISION>
[env: REVISION=]
--validation-workers <VALIDATION_WORKERS>
[env: VALIDATION_WORKERS=] [default: 2]
--json-output
[env: JSON_OUTPUT=]
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
--ngrok
[env: NGROK=]
--ngrok-authtoken <NGROK_AUTHTOKEN>
[env: NGROK_AUTHTOKEN=]
--ngrok-edge <NGROK_EDGE>
[env: NGROK_EDGE=]
--messages-api-enabled
[env: MESSAGES_API_ENABLED=]
--disable-grammar-support
[env: DISABLE_GRAMMAR_SUPPORT=]
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
[env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
-h, --help
Print help
-V, --version
Print version
```
## The Model Server
The model server is a Python server that waits for gRPC requests, loads a given model, optionally performs sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
The model server supports models instantiated using PyTorch and optimized for inference, mainly on CUDA/ROCm.
### Model Server Variants
Several variants of the model server exist that are actively supported by Hugging Face:
- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a [forked repository](https://github.com/huggingface/tgi-gaudi), often resynchronized with the main TGI repository.
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
### Command Line Interface
The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:
- `download-weights` will download weights from the hub and, in some variants, convert them to a format adapted to the given implementation;
- `quantize` will quantize a model using the `gptq` package. This feature is not available or supported on all variants;
- `serve` will start the server that loads a model (or a model shard), receives gRPC calls from the router, performs inference, and provides a formatted response to the given request.
The `serve` subcommand's command line parameters on the TGI repository are:
```
Usage: cli.py serve [OPTIONS] MODEL_ID
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
│ * model_id TEXT [default: None] [required] │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --revision TEXT [default: None] │
│ --sharded --no-sharded [default: no-sharded] │
│ --quantize [bitsandbytes|bitsandbytes [default: None] │
│ -nf4|bitsandbytes-fp4|gptq │
│ |awq|eetq|exl2|fp8] │
│ --speculate INTEGER [default: None] │
│ --dtype [float16|bfloat16] [default: None] │
│ --trust-remote-code --no-trust-remote-code [default: │
│ no-trust-remote-code] │
│ --uds-path PATH [default: │
│ /tmp/text-generation-serve… │
│ --logger-level TEXT [default: INFO] │
│ --json-output --no-json-output [default: no-json-output] │
│ --otlp-endpoint TEXT [default: None] │
│ --otlp-service-name TEXT [default: │
│ text-generation-inference...│
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
Note that some variants might support different parameters, and they may accept more options that can be passed using environment variables.
## Call Flow
Once both components are initialized, the weights are downloaded, and the model server is up and running, the router and the model server exchange data and info through gRPC calls. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:
- input chunks support, for text and image data,
- paged attention support
Here's a diagram that displays the exchanges that follow the router and model server startup.
```mermaid
sequenceDiagram
Router->>Model Server: service discovery
Model Server-->>Router: urls for other shards
Router->>Model Server: get model info
Model Server-->>Router: shard info
Router->>Model Server: health check
Model Server-->>Router: health OK
Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
Model Server-->>Router: warmup result
```
After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.
```mermaid
sequenceDiagram
participant Client 1
participant Client 2
participant Client 3
participant Router
participant Model Server
Client 1->>Router: generate_stream
Router->>Model Server: prefill(batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 1
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 2
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 3
Client 2->>Router: generate_stream
Router->>Model Server: prefill(batch2)
Note right of Model Server: This stops the previous batch, which is restarted
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 1'
Router->>Model Server: decode(cached_batch1, cached_batch2)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 4
Router-->>Client 2: token 2'
Note left of Client 1: Client 1 leaves
Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
Model Server-->>Router: filtered batch
Router->>Model Server: decode(cached_batch2)
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 3'
Client 3->>Router: generate_stream
Note right of Model Server: This stops the previous batch, which is restarted
Router->>Model Server: prefill(batch3)
Note left of Client 3: Client 3 leaves without receiving any tokens
Router->>Model Server: clear_cache(batch3)
Note right of Model Server: This stops the previous batch, which is restarted
Router->>Model Server: decode(cached_batch2)
Note right of Model Server: Last token (stopping criteria)
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 4'
```
# Consuming Text Generation Inference
There are many ways to consume the Text Generation Inference server in your applications. After launching, you can use the `/generate` route and make a `POST` request to get results from the server. You can also use the `/generate_stream` route if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python, or TypeScript. For a full end-to-end experience, we also open-sourced ChatUI, a chat interface for open-source models.
## curl
After the launch, you can query the model using either the `/generate` or `/generate_stream` routes:
```bash
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
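If you prefer Python over curl, here is a minimal sketch (assuming the server runs on `127.0.0.1:8080`) that consumes the `/generate_stream` route by reading the server-sent events and printing each token as it arrives:
```python
import json
import requests

# Hypothetical local endpoint; adjust to where your TGI server is listening.
url = "http://127.0.0.1:8080/generate_stream"
payload = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}

with requests.post(url, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        # Server-sent events are prefixed with "data:"; skip keep-alive blank lines.
        if line and line.startswith(b"data:"):
            event = json.loads(line[len(b"data:"):])
            print(event["token"]["text"], end="", flush=True)
print()
```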
## Inference Client
[`huggingface-hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a high-level class, [`~huggingface_hub.InferenceClient`], which makes it easy to make calls to a TGI endpoint. `InferenceClient` also takes care of parameter validation and provides a simple-to-use interface.
You can simply install the `huggingface-hub` package with pip.
```bash
pip install huggingface-hub
```
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
```python
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
client.text_generation(prompt="Write a code for snake game")
```
You can do streaming with `InferenceClient` by passing `stream=True`. Streaming will return tokens as they are being generated in the server. To use streaming, you can do as follows:
```python
for token in client.text_generation("How do you make cheese?", max_new_tokens=12, stream=True):
print(token)
```
Another parameter you can use with the TGI backend is `details`. You can get more details on generation (tokens, probabilities, etc.) by setting `details` to `True`. When it's specified, TGI will return a `TextGenerationResponse` or `TextGenerationStreamResponse` rather than a string or stream.
```python
output = client.text_generation(prompt="Meaning of life is", details=True)
print(output)
# TextGenerationResponse(generated_text=' a complex concept that is not always clear to the individual. It is a concept that is not always', details=Details(finish_reason=<FinishReason.Length: 'length'>, generated_tokens=20, seed=None, prefill=[], tokens=[Token(id=267, text=' a', logprob=-2.0723474, special=False), Token(id=11235, text=' complex', logprob=-3.1272552, special=False), Token(id=17908, text=' concept', logprob=-1.3632495, special=False),..))
```
You can see how to stream below.
```python
output = client.text_generation(prompt="Meaning of life is", stream=True, details=True)
print(next(iter(output)))
# TextGenerationStreamResponse(token=Token(id=267, text=' a', logprob=-2.0723474, special=False), generated_text=None, details=None)
```
You can check out the details of the function [here](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient).
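As a quick illustration, a minimal sketch using `AsyncInferenceClient` to stream tokens (assuming the same local endpoint on port 8080) could look like this:
```python
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    # Point the async client at the hypothetical local TGI endpoint.
    client = AsyncInferenceClient(model="http://127.0.0.1:8080")
    # With stream=True, the awaited call returns an async iterator of tokens.
    async for token in await client.text_generation(
        "How do you make cheese?", max_new_tokens=12, stream=True
    ):
        print(token, end="")
    print()


asyncio.run(main())
```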
## ChatUI
ChatUI is an open-source interface built for LLM serving. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
To serve both ChatUI and TGI in the same environment, simply add your own endpoints to the `MODELS` variable in the `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
```
{
// rest of the model config here
"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
}
```
![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
## Gradio
Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot in streaming mode using TGI and Gradio. First, install Gradio and the Hub Python library.
```bash
pip install huggingface-hub gradio
```
Assuming you are serving your model on port 8080, we will query it through [InferenceClient](consuming_tgi#inference-client).
```python
import gradio as gr
from huggingface_hub import InferenceClient
client = InferenceClient(model="http://127.0.0.1:8080")
def inference(message, history):
partial_message = ""
for token in client.text_generation(message, max_new_tokens=20, stream=True):
partial_message += token
yield partial_message
gr.ChatInterface(
inference,
chatbot=gr.Chatbot(height=300),
textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
description="This is the demo for Gradio UI consuming TGI endpoint with LLaMA 7B-Chat model.",
title="Gradio 🤝 TGI",
examples=["Are tomatoes vegetables?"],
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
).queue().launch()
```
The UI looks like this 👇
<div class="flex justify-center">
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
/>
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
/>
</div>
You can try the demo directly here 👇
<div class="block dark:hidden">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=light"
width="850"
height="750"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
width="850"
height="750"
></iframe>
</div>
You can disable streaming mode by using `return` instead of `yield` in your inference function, like below.
```python
def inference(message, history):
return client.text_generation(message, max_new_tokens=20)
```
You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).
## API documentation
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
# Serving Private & Gated Models
If the model you wish to serve is behind gated access or the model repository on the Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from the [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens).
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
```
export HF_TOKEN=<YOUR READ TOKEN>
```
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
```bash
model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data
token=<your READ token>
docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
--model-id $model
```
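Before launching the server, you may want to check that the token actually grants access to the gated repository. Here is a small optional sketch using `huggingface_hub` (the repository id is just the example model used above, and the token placeholder is the same one you pass to the container):
```python
from huggingface_hub import HfApi

token = "<your READ token>"  # same token you pass to the container
api = HfApi(token=token)

# Raises an error if the token is invalid or lacks access to the gated repo.
print(api.whoami()["name"])
info = api.model_info("meta-llama/Llama-2-7b-chat-hf")
print(f"Token can access: {info.id}")
```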
# Text-generation-launcher arguments
<!-- WRAP CODE BLOCKS -->
```shell
Text Generation Launcher
Usage: text-generation-launcher [OPTIONS]
Options:
```
## MODEL_ID
```shell
--model-id <MODEL_ID>
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
[env: MODEL_ID=]
[default: bigscience/bloom-560m]
```
## REVISION
```shell
--revision <REVISION>
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
[env: REVISION=]
```
## VALIDATION_WORKERS
```shell
--validation-workers <VALIDATION_WORKERS>
The number of tokenizer workers used for payload validation and truncation inside the router
[env: VALIDATION_WORKERS=]
[default: 2]
```
## SHARDED
```shell
--sharded <SHARDED>
Whether to shard the model across multiple GPUs. By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
[env: SHARDED=]
[possible values: true, false]
```
## NUM_SHARD
```shell
--num-shard <NUM_SHARD>
The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to launch 2 copies with 2 shards each on a given machine with 4 GPUs, for instance
[env: NUM_SHARD=]
```
## QUANTIZE
```shell
--quantize <QUANTIZE>
Whether you want the model to be quantized
[env: QUANTIZE=]
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernels (wider support) when it's not. AWQ has faster kernels
- marlin: 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
- fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above. This dtype has native ops and should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
```
## SPECULATE
```shell
--speculate <SPECULATE>
The number of input_ids to speculate on. If using a medusa model, the heads will be picked up automatically. Otherwise, it will use n-gram speculation which is relatively free in terms of compute, but the speedup heavily depends on the task
[env: SPECULATE=]
```
## DTYPE
```shell
--dtype <DTYPE>
The dtype to be forced upon the model. This option cannot be used with `--quantize`
[env: DTYPE=]
[possible values: float16, bfloat16]
```
## TRUST_REMOTE_CODE
```shell
--trust-remote-code
Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
[env: TRUST_REMOTE_CODE=]
```
## MAX_CONCURRENT_REQUESTS
```shell
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse client requests instead of having them wait for too long and is usually good to handle backpressure correctly
[env: MAX_CONCURRENT_REQUESTS=]
[default: 128]
```
## MAX_BEST_OF
```shell
--max-best-of <MAX_BEST_OF>
This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and returns the best in terms of overall log probability over the entire generated sequence
[env: MAX_BEST_OF=]
[default: 2]
```
## MAX_STOP_SEQUENCES
```shell
--max-stop-sequences <MAX_STOP_SEQUENCES>
This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
[env: MAX_STOP_SEQUENCES=]
[default: 4]
```
## MAX_TOP_N_TOKENS
```shell
--max-top-n-tokens <MAX_TOP_N_TOKENS>
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like classification or ranking
[env: MAX_TOP_N_TOKENS=]
[default: 5]
```
## MAX_INPUT_TOKENS
```shell
--max-input-tokens <MAX_INPUT_TOKENS>
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompts users can send, which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence lengths they can handle. Defaults to min(max_position_embeddings - 1, 4095)
[env: MAX_INPUT_TOKENS=]
```
## MAX_INPUT_LENGTH
```shell
--max-input-length <MAX_INPUT_LENGTH>
Legacy version of [`Args::max_input_tokens`]
[env: MAX_INPUT_LENGTH=]
```
## MAX_TOTAL_TOKENS
```shell
--max-total-tokens <MAX_TOTAL_TOKENS>
This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will take in your RAM and the less effective batching can be. Defaults to min(max_position_embeddings, 4096)
[env: MAX_TOTAL_TOKENS=]
```
## WAITING_SERVED_RATIO
```shell
--waiting-served-ratio <WAITING_SERVED_RATIO>
This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are only 10 queries left in the current batch, we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens, delaying the 10 running queries by a `prefill` run.
This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
[env: WAITING_SERVED_RATIO=]
[default: 0.3]
```
## MAX_BATCH_PREFILL_TOKENS
```shell
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Defaults to `max_input_tokens + 50` to give a bit of room
[env: MAX_BATCH_PREFILL_TOKENS=]
```
## MAX_BATCH_TOTAL_TOKENS
```shell
--max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
**IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
However in the non-padded (flash attention) version this can be much finer.
For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically.
[env: MAX_BATCH_TOTAL_TOKENS=]
```
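To make the budget arithmetic above concrete, here is a small illustrative sketch (not the router's actual scheduling code; the values are hypothetical, taken from the examples above) that checks whether a set of requests fits under given `--max-total-tokens` and `--max-batch-total-tokens` budgets:
```python
# Illustrative only: the real router tracks budgets per block/token, not per request.
MAX_TOTAL_TOKENS = 1512         # per-request budget: prompt tokens + max_new_tokens
MAX_BATCH_TOTAL_TOKENS = 32000  # whole-batch budget across all running requests


def fits_in_batch(requests):
    """requests: list of (prompt_tokens, max_new_tokens) tuples."""
    per_request_ok = all(p + n <= MAX_TOTAL_TOKENS for p, n in requests)
    batch_total = sum(p + n for p, n in requests)
    return per_request_ok and batch_total <= MAX_BATCH_TOTAL_TOKENS


# A prompt of 1000 tokens asking for 512 new tokens uses the full 1512 budget.
print(fits_in_batch([(1000, 512)]))      # True
print(fits_in_batch([(1000, 513)]))      # False: exceeds max-total-tokens
print(fits_in_batch([(1500, 12)] * 30))  # False: 45360 > max-batch-total-tokens
```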
## MAX_WAITING_TOKENS
```shell
--max-waiting-tokens <MAX_WAITING_TOKENS>
This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
[env: MAX_WAITING_TOKENS=]
[default: 20]
```
## MAX_BATCH_SIZE
```shell
--max-batch-size <MAX_BATCH_SIZE>
Enforce a maximum number of requests per batch. This is a specific flag for hardware targets that do not support unpadded inference
[env: MAX_BATCH_SIZE=]
```
## CUDA_GRAPHS
```shell
--cuda-graphs <CUDA_GRAPHS>
Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
[env: CUDA_GRAPHS=]
```
## HOSTNAME
```shell
--hostname <HOSTNAME>
The IP address to listen on
[env: HOSTNAME=]
[default: 0.0.0.0]
```
## PORT
```shell
-p, --port <PORT>
The port to listen on
[env: PORT=]
[default: 3000]
```
## SHARD_UDS_PATH
```shell
--shard-uds-path <SHARD_UDS_PATH>
The name of the socket for gRPC communication between the webserver and the shards
[env: SHARD_UDS_PATH=]
[default: /tmp/text-generation-server]
```
## MASTER_ADDR
```shell
--master-addr <MASTER_ADDR>
The address the master shard will listen on. (setting used by torch distributed)
[env: MASTER_ADDR=]
[default: localhost]
```
## MASTER_PORT
```shell
--master-port <MASTER_PORT>
The port the master shard will listen on. (setting used by torch distributed)
[env: MASTER_PORT=]
[default: 29500]
```
## HUGGINGFACE_HUB_CACHE
```shell
--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
[env: HUGGINGFACE_HUB_CACHE=]
```
## WEIGHTS_CACHE_OVERRIDE
```shell
--weights-cache-override <WEIGHTS_CACHE_OVERRIDE>
The location of the weights cache. Used to override the default location if you want to provide a mounted disk with pre-downloaded weights, for instance
[env: WEIGHTS_CACHE_OVERRIDE=]
```
## DISABLE_CUSTOM_KERNELS
```shell
--disable-custom-kernels
For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
[env: DISABLE_CUSTOM_KERNELS=]
```
## CUDA_MEMORY_FRACTION
```shell
--cuda-memory-fraction <CUDA_MEMORY_FRACTION>
Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
[env: CUDA_MEMORY_FRACTION=]
[default: 1.0]
```
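Like most launcher flags, `--cuda-graphs` and `--cuda-memory-fraction` can also be set through their environment variables. A minimal sketch, with illustrative values, that disables CUDA graph capture and caps TGI at 80% of the visible GPU memory:
```bash
# Illustrative: disable CUDA graph capture and limit TGI to 80% of visible GPU memory.
# <MODEL_HUB_ID> is a placeholder.
CUDA_GRAPHS=0 CUDA_MEMORY_FRACTION=0.8 \
    text-generation-launcher --model-id <MODEL_HUB_ID>
```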
## ROPE_SCALING
```shell
--rope-scaling <ROPE_SCALING>
Rope scaling will only be used for RoPE models and allows rescaling the rotary position embeddings to accommodate larger prompts.
Goes together with `rope_factor`.
`--rope-factor 2.0` gives linear scaling with a factor of 2.0. `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0. `--rope-scaling linear` gives linear scaling with a factor of 1.0 (basically nothing is changed).
`--rope-scaling linear --rope-factor` fully describes the scaling you want
[env: ROPE_SCALING=]
[possible values: linear, dynamic]
```
## ROPE_FACTOR
```shell
--rope-factor <ROPE_FACTOR>
Rope scaling will only be used for RoPE models. See `rope_scaling`
[env: ROPE_FACTOR=]
```
## JSON_OUTPUT
```shell
--json-output
Outputs the logs in JSON format (useful for telemetry)
[env: JSON_OUTPUT=]
```
## OTLP_ENDPOINT
```shell
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
```
## OTLP_SERVICE_NAME
```shell
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
[default: text-generation-inference.router]
```
## CORS_ALLOW_ORIGIN
```shell
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
```
## WATERMARK_GAMMA
```shell
--watermark-gamma <WATERMARK_GAMMA>
[env: WATERMARK_GAMMA=]
```
## WATERMARK_DELTA
```shell
--watermark-delta <WATERMARK_DELTA>
[env: WATERMARK_DELTA=]
```
## NGROK
```shell
--ngrok
Enable ngrok tunneling
[env: NGROK=]
```
## NGROK_AUTHTOKEN
```shell
--ngrok-authtoken <NGROK_AUTHTOKEN>
ngrok authentication token
[env: NGROK_AUTHTOKEN=]
```
## NGROK_EDGE
```shell
--ngrok-edge <NGROK_EDGE>
ngrok edge
[env: NGROK_EDGE=]
```
## TOKENIZER_CONFIG_PATH
```shell
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
[env: TOKENIZER_CONFIG_PATH=]
```
## DISABLE_GRAMMAR_SUPPORT
```shell
--disable-grammar-support
Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
[env: DISABLE_GRAMMAR_SUPPORT=]
```
## ENV
```shell
-e, --env
Display a lot of information about your runtime environment
```
## MAX_CLIENT_BATCH_SIZE
```shell
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
Control the maximum number of inputs that a client can send in a single request
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 4]
```
## LORA_ADAPTERS
```shell
--lora-adapters <LORA_ADAPTERS>
A comma-separated list of LoRA adapter ids, i.e. `repo/adapter1,repo/adapter2`, to load during startup; they will be available to callers via the `adapter_id` field in a request
[env: LORA_ADAPTERS=]
```
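As a sketch, loading two hypothetical adapters at startup and selecting one per request via the `adapter_id` field could look like this (the adapter ids and prompt are placeholders):
```bash
# Load two hypothetical adapters at startup (placeholder ids).
text-generation-launcher --model-id <MODEL_HUB_ID> \
    --lora-adapters <org/adapter1>,<org/adapter2>

# Select an adapter for a single request via the `adapter_id` parameter
# (port 3000 is the launcher default).
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"Hello","parameters":{"max_new_tokens":16,"adapter_id":"<org/adapter1>"}}'
```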
## HELP
```shell
-h, --help
Print help (see a summary with '-h')
```
## VERSION
```shell
-V, --version
Print version
```
# Monitoring TGI server with Prometheus and Grafana dashboard
TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Examples of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, the number of generated tokens, etc.
In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
## Setup on the server machine
First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
```
wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
cd prometheus-2.52.0.linux-amd64
```
Prometheus needs to be configured to listen on TGI's port. To do so, in Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
```
static_configs:
- targets: ["0.0.0.0:80"]
```
to use the correct IP address and port.
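For reference, a minimal `prometheus.yml` along those lines could look like the sketch below, written as a shell heredoc. The target assumes TGI listens on `0.0.0.0:80` as in this guide; adjust it to your actual IP and port.
```bash
# Minimal example scrape config pointing Prometheus at the TGI instance.
cat > prometheus.yml <<'EOF'
global:
  scrape_interval: 10s
scrape_configs:
  - job_name: tgi
    static_configs:
      - targets: ["0.0.0.0:80"]
EOF
```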
We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the IP and port are configured correctly.
Once Prometheus is configured, the Prometheus server can be launched on the same machine where TGI is running:
```
./prometheus --config.file="prometheus.yml"
```
In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (by default 9090) to the local computer. To do so, we can for example:
* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example) (a minimal sketch is shown right after this list)
* Use ngrok port tunneling
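A minimal sketch of the ssh option, assuming the TGI server is reachable as `user@tgi-server` (a placeholder), would be:
```bash
# Forward the remote Prometheus port 9090 to localhost:9090 on the monitoring machine.
# `user@tgi-server` is a placeholder for your actual ssh destination.
ssh -N -L 9090:localhost:9090 user@tgi-server
```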
For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.
For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
```bash
ngrok http http://0.0.0.0:9090
```
As a sanity check, one can make sure that the Prometheus server can be accessed from a local machine at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app).
## Setup on the monitoring machine
Monitoring is typically done on a machine other than the one hosting the server. We use a Grafana dashboard to monitor TGI's server usage.
Two options are available:
* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
* Self-host a Grafana dashboard.
In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
```bash
wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
cd grafana-11.0.0
./bin/grafana-server
```
Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added, using the Ngrok address we got earlier that exposes the Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
Load your dashboard configuration, and your TGI dashboard should be ready to go!
# Non-core Model Serving
TGI supports various LLM architectures (see the full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fall back to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
You can serve these models using the same Docker command-line invocation as with fully supported models 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```
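Once the container is up, you can sanity-check it with a plain `generate` request; the prompt and parameters below are arbitrary:
```bash
# Quick sanity check against the container started above (published on port 8080).
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"My name is Olivier and I","parameters":{"max_new_tokens":20}}'
```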
If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```
Finally, if the model is not on the Hugging Face Hub but stored locally, you can pass the path to the folder that contains your model like below 👇
```bash
# Make sure your model is in the $volume directory
docker run --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
```
You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.
# Preparing the Model
Text Generation Inference improves model serving in several aspects.
## Quantization
TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set the `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq); when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to the [quantization guide](./../conceptual/quantization).
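For instance, a sketch of serving a GPTQ-quantized checkpoint through Docker could look like this; `<GPTQ_MODEL_ID>` is a placeholder for any of the GPTQ models linked above:
```bash
# Illustrative: serve a GPTQ-quantized checkpoint; replace the placeholder model id.
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id <GPTQ_MODEL_ID> --quantize gptq
```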
## RoPE Scaling
RoPE scaling can be used to increase the sequence length of the model during inference without necessarily fine-tuning it. To enable RoPE scaling, simply pass the `--rope-scaling`, `--max-input-length` and `--rope-factor` flags when running through the CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide the maximum input length for extension.
<Tip>
We recommend using `dynamic` RoPE scaling.
</Tip>
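As a sketch, serving a model with dynamic RoPE scaling to roughly double its usable context could look like this; the model id and lengths are illustrative placeholders:
```bash
# Illustrative: dynamic RoPE scaling with a factor of 2.0 and an extended input length.
text-generation-launcher --model-id <MODEL_HUB_ID> \
    --rope-scaling dynamic \
    --rope-factor 2.0 \
    --max-input-length 4096 \
    --max-total-tokens 4608
```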
## Safetensors
[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there are no `safetensors` weights, TGI will convert the `pytorch` weights to the `safetensors` format.
# Model safety
[PyTorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default, meaning that for quite a long while
*every* model using that format could potentially execute unintended code simply by being loaded.
There is a big red warning on Python's page for pickle [link](https://docs.python.org/3/library/pickle.html), but for quite a while
this was ignored by the community. Now that AI/ML is being used much more ubiquitously, we need to switch away from this format.
HuggingFace is leading the effort here by creating a new format which contains pure data ([safetensors](https://github.com/huggingface/safetensors))
and by moving, slowly but surely, all the libraries to use it by default.
The move is intentionally slow in order to make breaking changes have as little impact as possible on users.
# TGI 2.0
With the release of TGI 2.0, we took the opportunity of this major version increase to break backward compatibility for these PyTorch pickle
models (since they are a huge security risk for anyone deploying them).
From now on, TGI will not automatically convert pickle files without the `--trust-remote-code` flag or `TRUST_REMOTE_CODE=true` set in the environment variables.
This flag is already used for community-defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.
If you want to use a model that uses pickle, but you still do not want to trust the authors entirely, we recommend making a conversion on the space we made for that:
https://huggingface.co/spaces/safetensors/convert
This space will create a PR on the original model, which you can use directly regardless of its merge status with the original authors. Just use
```
docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
```
# Train Medusa
This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation) for more information on how Medusa works and speculation in general.
## What are the benefits of training a Medusa model?
Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.
One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.
If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.
## Self-distillation (Generating data for training)
There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.
Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.
We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.
## Training
The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.
### Getting Started
There are two methods for training the model:
- `torchrun`, a wrapper around `torch.distributed.launch`
- a forked version of `axolotl` that supports Medusa
In this tutorial we'll use `torchrun` to train the model, as it is the most straightforward approach; similar steps can be followed with `axolotl` if you prefer.
### Training with `torchrun`
```bash
mkdir medusa-training
cd medusa-training
pyenv install 3.10
pyenv local 3.10
uv venv -p 3.10
source .venv/bin/activate
```
Now let's clone the original `Medusa` repository and install the library.
```bash
git clone https://github.com/FasterDecoding/Medusa.git
cd Medusa
pip install -e .
```
Next, we'll need some data to train on. We can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.
```bash
apt install git-lfs
git lfs install
git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
```
Currently our directory structure looks like this:
```bash
.
├── assets
├── CITATION.cff
├── create_data.py
├── data_generation
├── deepspeed.json
├── last_run_prepared
├── LICENSE
├── llm_judge
├── medusa
├── medusa_llm.egg-info
├── mistral.json
├── notebooks
├── pyproject.toml
├── README.md
├── ROADMAP.md
├── scripts
├── ShareGPT_Vicuna_unfiltered
│   ├── README.md
│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
├── simple_gradio_interface.py
├── tiny-llama.json
└── vicuna_7b_qlora_stage1
```
## Start Training
Now let's generate the data and start training the model. This process will take a while since we are generating data from the model.
First make sure you have an instance of TGI running with the model you want to use for self-distillation.
```bash
model=HuggingFaceH4/zephyr-7b-beta
volume=/home/ubuntu/.cache/huggingface/hub/
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
```
Now we can generate the data using the `create_data.py` script.
```bash
python create_data.py \
--input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
--output-filename zephyr_self_distill.json
```
At this point our terminal should look like this:
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
width="550"
/>
</div>
> Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process; you should use a much larger dataset for training.
Now we can finally get to the fun part and start training the model!
Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file.
> NOTE: If you just self-distilled, you may still have the model running; make sure to stop it before starting the training so that all of the resources can be used for training.
```bash
WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
--model_name_or_path HuggingFaceH4/zephyr-7b-beta \
--data_path zephyr_self_distill.json \
--bf16 True \
--output_dir zephyr_out \
--num_train_epochs 5 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--evaluation_strategy "no" \
--save_strategy "no" \
--learning_rate 1e-3 \
--weight_decay 0.0 \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--lazy_preprocess True \
--medusa_num_heads 3 \
--medusa_num_layers 1 \
--deepspeed deepspeed.json
```
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
width="550"
/>
</div>
If successful, you should see output similar to the one below:
```bash
wandb: Run history:
wandb: train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb: train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb: train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
wandb: train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
wandb: train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb: train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
wandb: train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
wandb: train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
wandb: train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb: train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
wandb: train/total_flos ▁
wandb: train/train_loss ▁
wandb: train/train_runtime ▁
wandb: train/train_samples_per_second ▁
wandb: train/train_steps_per_second ▁
wandb:
wandb: Run summary:
wandb: train/epoch 2.0
wandb: train/global_step 16
wandb: train/learning_rate 0.0
wandb: train/loss 14.8906
wandb: train/medusa0_loss 4.25
wandb: train/medusa0_top1 0.28809
wandb: train/medusa1_loss 4.8125
wandb: train/medusa1_top1 0.22727
wandb: train/medusa2_loss 5.5
wandb: train/medusa2_top1 0.17293
wandb: train/total_flos 0.0
wandb: train/train_loss 23.98242
wandb: train/train_runtime 396.9266
wandb: train/train_samples_per_second 2.519
wandb: train/train_steps_per_second 0.04
```
Finally, and most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.
```bash
python -m medusa.hf_utils \
--folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
--repo drbh/zephyr_medusa_demo
```
Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
# Using TGI CLI
You can use the TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).
`text-generation-server` lets you download the model with the `download-weights` command like below 👇
```bash
text-generation-server download-weights MODEL_HUB_ID
```
You can also use it to quantize models like below 👇
```bash
text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
```
You can use `text-generation-launcher` to serve models.
```bash
text-generation-launcher --model-id MODEL_HUB_ID --port 8080
```
There are many options and parameters you can pass to `text-generation-launcher`. The documentation for the CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running
```bash
text-generation-launcher --help
```
You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).
The same documentation can be found for `text-generation-server`.
```bash
text-generation-server --help
```
# Guidance
Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.
These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
## How it works
TGI leverages the [outlines](https://github.com/outlines-dev/outlines) library to efficiently parse and compile the grammatical structures and tools specified by users. This integration transforms the defined grammars into an intermediate representation that acts as a framework to guide and constrain content generation, ensuring that outputs adhere to the specified grammatical rules.
If you are interested in the technical details on how outlines is used in TGI, you can check out the [conceptual guidance documentation](../conceptual/guidance).
## Table of Contents 📚
### Grammar and Constraints
- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
- [JSON Schema Integration](#json-schema-integration): Fine-grained control over your requests via JSON schema.
- [Using the client](#using-the-client): Use TGI's client libraries to shape the AI's responses.
### Tools and Functions
- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
- [Via the client](#text-generation-inference-client): Use TGI's client libraries to interact with the Messages API and Tool functions.
- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
## Grammar and Constraints 🛣️
### The Grammar Parameter
In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the LLM.
Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.
```bash
curl localhost:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
"parameters": {
"repetition_penalty": 1.3,
"grammar": {
"type": "json",
"value": {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
}
}
}'
// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}
```
### Hugging Face Hub Python Library
The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
schema = {
"properties": {
"location": {"title": "Location", "type": "string"},
"activity": {"title": "Activity", "type": "string"},
"animals_seen": {
"maximum": 5,
"minimum": 1,
"title": "Animals Seen",
"type": "integer",
},
"animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
},
"required": ["location", "activity", "animals_seen", "animals"],
"title": "Animals",
"type": "object",
}
user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
max_new_tokens=100,
seed=42,
grammar={"type": "json", "value": schema},
)
print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```
A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.
> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
### Constrain with Pydantic
Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
```python
from huggingface_hub import InferenceClient
from pydantic import BaseModel, conint
from typing import List
class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]
client = InferenceClient("http://localhost:3000")
user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
max_new_tokens=100,
seed=42,
grammar={"type": "json", "value": Animals.schema()},
)
print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```
A grammar can also be defined using regular expressions:
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
resp = client.text_generation(
f"Whats Googles DNS? Please use the following regex: {regexp}",
seed=42,
grammar={
"type": "regex",
"value": regexp,
},
)
print(resp)
# 7.1.1.1
```
## Tools and Functions 🛠️
### The Tools Parameter
In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.
Tools are a set of user-defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities. Functions, similar to grammars, are defined as a JSON schema and can be passed as part of the parameters to the Messages API.
```bash
curl localhost:3000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"model": "tgi",
"messages": [
{
"role": "user",
"content": "What is the weather like in New York?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location."
}
},
"required": ["location", "format"]
}
}
}
],
"tool_choice": "get_current_weather"
}'
// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
```
### Chat Completion with Tools
Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:3000")
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
},
{
"type": "function",
"function": {
"name": "get_n_day_weather_forecast",
"description": "Get an N-day weather forecast",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
"num_days": {
"type": "integer",
"description": "The number of days to forecast",
},
},
"required": ["location", "format", "num_days"],
},
},
},
]
chat = client.chat_completion(
messages=[
{
"role": "system",
"content": "You're a helpful assistant! Answer the users question best you can.",
},
{
"role": "user",
"content": "What is the weather like in Brooklyn, New York?",
},
],
tools=tools,
seed=42,
max_tokens=100,
)
print(chat.choices[0].message.tool_calls)
# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
```
### OpenAI integration
TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
However, there are some minor differences in the API; for example, `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API, where `tool_choice="auto"` will choose a tool only if the model thinks it's necessary.
```python
from openai import OpenAI
# Initialize the client, pointing it to one of the available models
client = OpenAI(
base_url="http://localhost:3000/v1",
api_key="_",
)
# NOTE: tools defined above and removed for brevity
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{
"role": "system",
"content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
},
{
"role": "user",
"content": "What's the weather like the next 3 days in San Francisco, CA?",
},
],
tools=tools,
tool_choice="auto", # tool selected by model
max_tokens=500,
)
called = chat_completion.choices[0].message.tool_calls
print(called)
# {
# "id": 0,
# "type": "function",
# "function": {
# "description": None,
# "name": "tools",
# "parameters": {
# "format": "celsius",
# "location": "San Francisco, CA",
# "num_days": 3,
# },
# },
# }
```
# Vision Language Model Inference in TGI
Vision-language models (VLMs) are models that consume both image and text inputs to generate text.
VLMs are trained on a combination of image and text data and can handle a wide range of tasks, such as image captioning, visual question answering, and visual dialog.
> What distinguishes VLMs from other text and image models is their ability to handle long context and generate text that is coherent and relevant to the image even after multiple turns or, in some cases, multiple images.
Below are a couple of common use cases for vision language models:
- **Image Captioning**: Given an image, generate a caption that describes the image.
- **Visual Question Answering (VQA)**: Given an image and a question about the image, generate an answer to the question.
- **Multimodal Dialog**: Generate responses to multiple turns of images and conversations.
- **Image Information Retrieval**: Given an image, retrieve information from the image.
## How to Use a Vision Language Model?
### Hugging Face Hub Python Library
To infer with vision language models through Python, you can use the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The `InferenceClient` class provides a simple way to interact with the [Inference API](https://huggingface.co/docs/api-inference/index). Images can be passed as URLs or base64-encoded strings. The `InferenceClient` will automatically detect the image format.
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://127.0.0.1:3000")
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
prompt = f"![]({image})What is this a picture of?\n\n"
for token in client.text_generation(prompt, max_new_tokens=16, stream=True):
    print(token)
# This is a picture of an anthropomorphic rabbit in a space suit.
```
```python
from huggingface_hub import InferenceClient
import base64
import requests
import io
client = InferenceClient("http://127.0.0.1:3000")
# read image from local file
image_path = "rabbit.png"
with open(image_path, "rb") as f:
    image = base64.b64encode(f.read()).decode("utf-8")
image = f"data:image/png;base64,{image}"
prompt = f"![]({image})What is this a picture of?\n\n"
for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
    print(token)
# This is a picture of an anthropomorphic rabbit in a space suit.
```
or via the `chat_completion` endpoint:
```python
from huggingface_hub import InferenceClient
client = InferenceClient("http://127.0.0.1:3000")
chat = client.chat_completion(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
},
},
],
},
],
seed=42,
max_tokens=100,
)
print(chat)
# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
```
or with OpenAI's library:
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
},
},
],
},
],
stream=False,
)
print(chat_completion)
# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
```
### Inference Through Sending `cURL` Requests
To use the `generate_stream` endpoint with curl, you can add the `-N` flag. This flag disables curl's default buffering and shows data as it arrives from the server.
```bash
curl -N 127.0.0.1:3000/generate_stream \
-X POST \
-d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
-H 'Content-Type: application/json'
# ...
# data:{"index":16,"token":{"id":28723,"text":".","logprob":-0.6196289,"special":false},"generated_text":"This is a picture of an anthropomorphic rabbit in a space suit.","details":null}
```
### Inference Through JavaScript
First, we need to install the `@huggingface/inference` library.
```bash
npm install @huggingface/inference
```
If you're using the free Inference API, you can use [Huggingface.js](https://huggingface.co/docs/huggingface.js/inference/README)'s `HfInference`. If you're using inference endpoints, you can use the `HfInferenceEndpoint` class to easily interact with the Inference API.
We can create a `HfInferenceEndpoint` providing our endpoint URL and a [Hugging Face access token](https://huggingface.co/settings/tokens).
```js
import { HfInferenceEndpoint } from "@huggingface/inference";
const hf = new HfInferenceEndpoint("http://127.0.0.1:3000", "HF_TOKEN");
const prompt =
"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n";
const stream = hf.textGenerationStream({
inputs: prompt,
parameters: { max_new_tokens: 16, seed: 42 },
});
for await (const r of stream) {
// yield the generated token
process.stdout.write(r.token.text);
}
// This is a picture of an anthropomorphic rabbit in a space suit.
```
## Combining Vision Language Models with Other Features
VLMs in TGI have several advantages, for example these models can be used in tandem with other features for more complex tasks. For example, you can use VLMs with [Guided Generation](/docs/conceptual/guided-generation) to generate specific JSON data from an image.
<div class="flex justify-center">
<img
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
width="400"
/>
</div>
For example we can extract information from the rabbit image and generate a JSON object with the location, activity, number of animals seen, and the animals seen. That would look like this:
```json
{
"activity": "Standing",
"animals": ["Rabbit"],
"animals_seen": 1,
"location": "Rocky surface with mountains in the background and a red light on the rabbit's chest"
}
```
All we need to do is provide a JSON schema to the VLM model and it will generate the JSON object for us.
```bash
curl localhost:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n",
"parameters": {
"max_new_tokens": 100,
"seed": 42,
"grammar": {
"type": "json",
"value": {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
}
}
}'
# {
# "generated_text": "{ \"activity\": \"Standing\", \"animals\": [ \"Rabbit\" ], \"animals_seen\": 1, \"location\": \"Rocky surface with mountains in the background and a red light on the rabbit's chest\" }"
# }
```
Want to learn more about how Vision Language Models work? Check out the [awesome blog post on the topic](https://huggingface.co/blog/vlms).