Commit efd602c8 authored by xuxzh1

last

parent f1b779fc
install-server:
cd server && make install
install-custom-kernels:
if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
install-server-cpu:
cd server && make install-server
install-router:
cd router && cargo install --path . --debug
install-launcher:
cd launcher && cargo install --path .
@@ -17,7 +13,10 @@ install-launcher:
install-benchmark:
cd benchmark && cargo install --path .
install: install-server install-router install-launcher
install-cpu: install-server-cpu install-router install-launcher
server-dev:
cd server && make run-dev
@@ -28,6 +27,10 @@ router-dev:
rust-tests: install-router install-launcher
cargo test
install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
integration-tests: install-integration-tests
pytest -s -vv -m "not private" integration-tests
@@ -34,19 +34,19 @@ Text Generation Inference (TGI) is a framework written in Rust and Python
Install the pytorch, triton, and flash-attn packages into your existing Python environment:
**Install pytorch**
Install pytorch 2.1.0. The pytorch whl packages can be downloaded from [https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch); download the pytorch 2.1.0 whl matching your python and dtk versions. Install it as follows:
```bash
pip install torch*  # the downloaded torch whl package
pip install setuptools wheel
```
**Install triton**
Download the triton whl packages from [https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton); pick the triton 2.1 whl matching your python and dtk versions.
```bash
pip install triton*  # the downloaded triton whl package
```
**Install flash-attn**
Download the flash_attn whl packages from [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn); pick the flash_attn 2.0.4 whl matching your python and dtk versions.
```bash
pip install flash_attn*  # the downloaded flash_attn whl package
```
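With the three wheels in place, a quick sanity check helps before building TGI itself. A minimal sketch, assuming only that each package exposes the usual `__version__` attribute:
```bash
# verify that the freshly installed wheels import cleanly
python -c "import torch; print(torch.__version__)"
python -c "import triton; print(triton.__version__)"
python -c "import flash_attn; print(flash_attn.__version__)"
```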
@@ -66,36 +66,41 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
3. Install the TGI service
```bash
git clone http://developer.hpccube.com/codes/OpenDAS/text-generation-inference.git  # switch to the branch you need
cd text-generation-inference
# install vllm and the exllama kernels
cd server
pip uninstall vllm  # optional: if the environment was prepared via method 1, uninstall its default vllm first
make install-vllm  # install the customized vllm
make install-exllama  # install the exllama kernels
make install-exllamav2  # install the exllamav2 kernels
cd ..  # back to the project root
source $HOME/.cargo/env
BUILD_EXTENSIONS=True make install  # install the text-generation service
```
4. Install the benchmark tool
```bash
cd text-generation-inference
make install-benchmark
```
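The benchmark attaches to an already-running TGI instance, so start the service first (see "Before use" below) and invoke the tool from a second shell. A minimal sketch; the model path is a placeholder for whatever model you deployed:
```bash
# in a second shell, with text-generation-launcher already running:
text-generation-benchmark --tokenizer-name /path/to/your-model
```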
Note: if installation is too slow, you can switch the default package index with the following command to speed it up.
```bash
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
```
In addition, if `cargo install` is too slow, you can speed it up by adding a mirror source in `~/.cargo/config`; see the example below.
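A minimal sketch, assuming the TUNA crates.io mirror (any registry mirror you trust works the same way):
```bash
# append a source-replacement section to the cargo config
mkdir -p ~/.cargo
cat >> ~/.cargo/config <<'EOF'
[source.crates-io]
replace-with = 'tuna'

[source.tuna]
registry = "https://mirrors.tuna.tsinghua.edu.cn/git/crates.io-index.git"
EOF
```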
## Check the installed version
```bash
text-generation-launcher -V  # the version number tracks the official release
```
## Before use
```bash
export PYTORCH_TUNABLEOP_ENABLED=0
```
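With the variable exported, the service can be started and smoke-tested. A minimal sketch; the model path and port are placeholders, and `/generate` is TGI's standard endpoint:
```bash
text-generation-launcher --model-id /path/to/your-model --port 8080 &

# once the shards are up, send a test request
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```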
## Known Issue
-
## References
{
"__inputs": [
{
"name": "DS_PROMETHEUS_EKS API INFERENCE PROD",
"label": "Prometheus EKS API Inference Prod",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
},
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.2"
},
{
"type": "panel",
"id": "heatmap",
"name": "Heatmap",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 2,
"id": 551,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"fieldMinMax": false,
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1000
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 0
},
"id": 49,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) > 0",
"hide": true,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))) * 1000) > 0",
"hide": true,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "C"
},
{
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"expression": "$B + $C",
"hide": false,
"refId": "D",
"type": "math"
}
],
"title": "Time to first token",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 8,
"x": 9,
"y": 0
},
"id": 44,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m]))) * 1000)>0",
"instant": false,
"range": true,
"refId": "A"
}
],
"title": "Decode per-token latency",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 7,
"x": 17,
"y": 0
},
"id": 45,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "sum((rate(tgi_request_generated_tokens_sum{container=\"$service\"}[10m]) / rate(tgi_request_generated_tokens_count{container=\"$service\"}[10m]))>0)",
"instant": false,
"range": true,
"refId": "A"
}
],
"title": "Throughput (generated tok/s)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 7
},
"id": 48,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Number of tokens per prompt",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 7
},
"id": 30,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Number of generated tokens per request",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 15
},
"id": 20,
"panels": [],
"title": "General",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 16
},
"id": 4,
"maxDataPoints": 100,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "sum(increase(tgi_request_success{container=\"$service\"}[1m]))",
"legendFormat": "Success",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "sum(increase(tgi_request_failure{container=\"$service\"}[1m])) by (err)",
"hide": false,
"legendFormat": "Error: {{err}}",
"range": true,
"refId": "B"
}
],
"title": "Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 13,
"w": 9,
"x": 6,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Mean Time Per Token quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 13,
"w": 9,
"x": 15,
"y": 16
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 13,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Mean Time Per Token",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 70
},
{
"color": "red",
"value": 85
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 0,
"y": 24
},
"id": 18,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "count(tgi_request_count{container=\"$service\"})",
"legendFormat": "Replicas",
"range": true,
"refId": "A"
}
],
"title": "Number of replicas",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 70
},
{
"color": "red",
"value": 85
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 3,
"y": 24
},
"id": 32,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "sum(tgi_queue_size{container=\"$service\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Queue Size",
"type": "gauge"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 29
},
"id": 26,
"panels": [],
"title": "Batching",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 30
},
"id": 29,
"maxDataPoints": 40,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "avg(tgi_batch_current_max_tokens{container=\"$service\"})",
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Max tokens per batch",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 4,
"x": 6,
"y": 30
},
"id": 33,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Speculated Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 5,
"x": 10,
"y": 30
},
"id": 46,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Prompt Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 9,
"x": 15,
"y": 30
},
"id": 8,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Latency quantiles",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 35
},
"id": 27,
"maxDataPoints": 40,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "9.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "avg(tgi_batch_current_size{container=\"$service\"})",
"legendFormat": "{{ pod }}",
"range": true,
"refId": "A"
}
],
"title": "Batch Size",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 6,
"x": 0,
"y": 39
},
"id": 28,
"maxDataPoints": 100,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "sum(increase(tgi_batch_concat{container=\"$service\"}[1m])) by (reason)",
"hide": false,
"legendFormat": "Reason: {{ reason }}",
"range": true,
"refId": "B"
}
],
"title": "Concatenates",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 9,
"x": 6,
"y": 39
},
"id": 31,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Queue quantiles",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 48
},
"id": 22,
"panels": [],
"title": "Prefill",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 12,
"x": 0,
"y": 49
},
"id": 7,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Prefill Quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 12,
"x": 12,
"y": 49
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 14,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Prefill Latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 60
},
"id": 24,
"panels": [],
"title": "Decode",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 12,
"x": 0,
"y": 61
},
"id": 11,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Decode quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 12,
"x": 12,
"y": 61
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 15,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Decode Latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 72
},
"id": 43,
"panels": [],
"title": "Debug",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 6,
"x": 0,
"y": 73
},
"id": 38,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Forward quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 6,
"x": 6,
"y": 73
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 35,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Forward Latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 6,
"x": 12,
"y": 73
},
"id": 34,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Token Decode quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 6,
"x": 18,
"y": 73
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 40,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Token Decode Latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 6,
"x": 0,
"y": 84
},
"id": 42,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Filter Batch quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 6,
"x": 6,
"y": 84
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 39,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Filter Batch Latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "p50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "p99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 11,
"w": 6,
"x": 12,
"y": 84
},
"id": 36,
"options": {
"legend": {
"calcs": [
"min",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"legendFormat": "p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p90",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
"hide": false,
"legendFormat": "p99",
"range": true,
"refId": "C"
}
],
"title": "Batch Concat quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#5794F2",
"colorScale": "linear",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"min": 0,
"mode": "opacity"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 11,
"w": 6,
"x": 18,
"y": 84
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 41,
"legend": {
"show": false
},
"maxDataPoints": 25,
"options": {
"calculate": false,
"calculation": {},
"cellGap": 2,
"cellValues": {},
"color": {
"exponent": 0.5,
"fill": "#5794F2",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 128
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"rowsFrame": {
"layout": "auto"
},
"showValue": "never",
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"decimals": 1,
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.2",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"editorMode": "code",
"exemplar": true,
"expr": "sum(increase(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
"format": "heatmap",
"interval": "",
"legendFormat": "{{ le }}",
"range": true,
"refId": "A"
}
],
"title": "Batch Concat latency",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 1,
"format": "s",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
}
],
"refresh": "",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1",
"value": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1"
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
},
"definition": "label_values(tgi_request_count, container)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "service",
"options": [],
"query": {
"query": "label_values(tgi_request_count, container)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-30m",
"to": "now-30s"
},
"timepicker": {
"nowDelay": "30s"
},
"timezone": "",
"title": "Text Generation Inference",
"uid": "RHSk7EL4kdqsd",
"version": 12,
"weekStart": ""
}
......@@ -497,7 +497,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
"Lowest: {:.2} {unit}",
data.iter()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN)
.unwrap_or(&f64::NAN)
),
Style::default().fg(Color::Reset),
)]),
......@@ -506,7 +506,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
"Highest: {:.2} {unit}",
data.iter()
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN)
.unwrap_or(&f64::NAN)
),
Style::default().fg(Color::Reset),
)]),
......@@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
let min_latency: f64 = *latency_iter
.clone()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max_latency: f64 = *latency_iter
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let min_throughput: f64 = *throughput_iter
.clone()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max_throughput: f64 = *throughput_iter
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
// Chart min max values
let min_x = if zoom {
......
......@@ -11,7 +11,7 @@ pub(crate) enum Event {
/// Key press.
Key(event::KeyEvent),
/// Terminal resize.
Resize(u16, u16),
Resize,
}
pub(crate) async fn terminal_event_task(
......@@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
if event::poll(Duration::from_secs(0)).expect("no events available") {
match event::read().expect("unable to read event") {
event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
event::Event::Resize(w, h) => {
event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
event::Event::Resize(_w, _h) => {
event_sender.send(Event::Resize).await.unwrap_or(())
}
_ => (),
}
......
use std::time::{Duration, Instant};
use text_generation_client::{
Batch, CachedBatch, ClientError, NextTokenChooserParameters, Request, ShardedClient,
use text_generation_client::v3::{
Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
StoppingCriteriaParameters,
};
use text_generation_client::{Chunk, ClientError, Input};
use tokenizers::{Tokenizer, TruncationDirection};
use tokio::sync::{broadcast, mpsc};
......@@ -142,6 +143,9 @@ async fn prefill(
.map(|id| Request {
id: id.into(),
prefill_logprobs: false,
input_chunks: Some(Input {
chunks: vec![Chunk::Text(sequence.clone()).into()],
}),
inputs: sequence.clone(),
truncate: sequence_length,
parameters: Some(parameters.clone()),
......@@ -151,6 +155,9 @@ async fn prefill(
ignore_eos_token: true, // Will not stop even if an eos token is generated
}),
top_n_tokens: top_n_tokens.unwrap_or(0),
blocks: vec![],
slots: vec![],
adapter_id: None,
})
.collect();
......@@ -159,6 +166,7 @@ async fn prefill(
requests,
size: batch_size,
max_tokens: batch_size * (sequence_length + decode_length),
max_blocks: 0,
};
// Run prefill
......
......@@ -8,7 +8,7 @@ use crate::app::App;
use crate::event::Event;
use crossterm::ExecutableCommand;
use std::io;
use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
use tokenizers::Tokenizer;
use tokio::sync::{broadcast, mpsc};
use tui::backend::CrosstermBackend;
......
......@@ -4,7 +4,7 @@
/// and: https://github.com/orhun/rust-tui-template
use clap::Parser;
use std::path::Path;
use text_generation_client::ShardedClient;
use text_generation_client::v3::ShardedClient;
use tokenizers::{FromPretrainedParameters, Tokenizer};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
......@@ -147,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing::info!("Downloading tokenizer");
// Parse Huggingface hub token
let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
let auth_token = std::env::var("HF_TOKEN")
.or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
.ok();
// Download and instantiate tokenizer
// We need to download it outside of the Tokio runtime
......
......@@ -156,17 +156,17 @@ fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
let min = data
.iter()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max = data
.iter()
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
(average, *min, *max)
}
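/// p-th percentile by nearest rank; assumes `data` is sorted ascending and falls back to NaN when the computed index is out of range.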
fn px(data: &[f64], p: u32) -> f64 {
let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
*data.get(i).unwrap_or(&std::f64::NAN)
*data.get(i).unwrap_or(&f64::NAN)
}
fn format_value(value: f64, unit: &'static str) -> String {
......
......@@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
.iter()
.map(|&p| {
let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
(format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
(format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
})
.collect()
}
......@@ -12,7 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.6.0"
__version__ = "0.7.0"
DEPRECATION_WARNING = (
"`text_generation` clients are deprecated and will be removed in the near future. "
"Please use the `InferenceClient` from the `huggingface_hub` package instead."
)
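# Example migration away from this deprecated client (the URL below is a
# placeholder for a reachable TGI endpoint):
#
#     from huggingface_hub import InferenceClient
#
#     client = InferenceClient("http://localhost:8080")
#     client.text_generation("What is Deep Learning?", max_new_tokens=20)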
from text_generation.client import Client, AsyncClient
from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
import json
import requests
import warnings
from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
from text_generation import DEPRECATION_WARNING
from text_generation.types import (
StreamResponse,
Response,
Request,
Parameters,
Grammar,
CompletionRequest,
Completion,
CompletionComplete,
ChatRequest,
ChatCompletionChunk,
ChatComplete,
......@@ -19,6 +24,9 @@ from text_generation.types import (
)
from text_generation.errors import parse_error
# emit deprecation warnings
warnings.simplefilter("always", DeprecationWarning)
class Client:
"""Client to make calls to a text-generation-inference instance
......@@ -59,11 +67,100 @@ class Client:
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = timeout
def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
):
"""
Given a prompt, generate a response synchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return Completion(**payload)
else:
return self._completion_stream_response(request)
def _completion_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the SSE stream and yield parsed completions
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
def chat(
self,
messages: List[Message],
......@@ -82,6 +179,7 @@ class Client:
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
):
"""
Given a list of messages, generate a response synchronously
......@@ -124,6 +222,8 @@ class Client:
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
......@@ -144,6 +244,7 @@ class Client:
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
resp = requests.post(
......@@ -449,11 +550,99 @@ class AsyncClient:
timeout (`int`):
Timeout in seconds
"""
warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = ClientTimeout(timeout)
async def completion(
self,
prompt: str,
frequency_penalty: Optional[float] = None,
max_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
seed: Optional[int] = None,
stream: bool = False,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
stop: Optional[List[str]] = None,
) -> Union[Completion, AsyncIterator[CompletionComplete]]:
"""
Given a prompt, generate a response asynchronously
Args:
prompt (`str`):
Prompt
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty
Penalize new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
max_tokens (`int`):
Maximum number of generated tokens
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
seed (`int`):
Random sampling seed
stream (`bool`):
Stream the response
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = CompletionRequest(
model="tgi",
prompt=prompt,
frequency_penalty=frequency_penalty,
max_tokens=max_tokens,
repetition_penalty=repetition_penalty,
seed=seed,
stream=stream,
temperature=temperature,
top_p=top_p,
stop=stop,
)
if not stream:
return await self._completion_single_response(request)
else:
return self._completion_stream_response(request)
async def _completion_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return Completion(**payload)
async def _completion_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = CompletionComplete(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
async def chat(
self,
messages: List[Message],
......@@ -472,6 +661,7 @@ class AsyncClient:
tools: Optional[List[Tool]] = None,
tool_prompt: Optional[str] = None,
tool_choice: Optional[str] = None,
stop: Optional[List[str]] = None,
) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
"""
Given a list of messages, generate a response asynchronously
......@@ -514,6 +704,8 @@ class AsyncClient:
A prompt to be appended before the tools
tool_choice (`str`):
The tool to use
stop (`List[str]`):
Stop generating tokens if a member of `stop` is generated
"""
request = ChatRequest(
......@@ -534,6 +726,7 @@ class AsyncClient:
tools=tools,
tool_prompt=tool_prompt,
tool_choice=tool_choice,
stop=stop,
)
if not stream:
return await self._chat_single_response(request)
......
from enum import Enum
from pydantic import BaseModel, field_validator
from pydantic import BaseModel, field_validator, ConfigDict
from typing import Optional, List, Union, Any
from text_generation.errors import ValidationError
......@@ -46,30 +46,6 @@ class Tool(BaseModel):
function: dict
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class CompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Function(BaseModel):
name: Optional[str]
arguments: str
......@@ -95,24 +71,41 @@ class Choice(BaseModel):
finish_reason: Optional[str] = None
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
class CompletionRequest(BaseModel):
# Model identifier
model: str
system_fingerprint: str
choices: List[Choice]
# Prompt
prompt: str
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
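# Example (illustrative): CompletionRequest(model="tgi", prompt="What is Deep Learning?", max_tokens=16)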
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class CompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Completion(BaseModel):
......@@ -163,6 +156,41 @@ class ChatRequest(BaseModel):
tool_prompt: Optional[str] = None
# Choice of tool to be used
tool_choice: Optional[str] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[Choice]
class Parameters(BaseModel):
......@@ -424,5 +452,9 @@ class StreamResponse(BaseModel):
# Inference API currently deployed model
class DeployedModel(BaseModel):
# Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
# with model_ prefixes, since this disables guardrails for colliding fields:
# https://github.com/pydantic/pydantic/issues/9177
model_config = ConfigDict(protected_namespaces=())
model_id: str
sha: str
Documentation available at: https://huggingface.co/docs/text-generation-inference
## Release
When making a release, please update the latest version in the documentation with:
```bash
export OLD_VERSION="2\.0\.3"
export NEW_VERSION="2\.0\.4"
find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
```
......@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.0.1"
"version": "2.1.1"
},
"paths": {
"/": {
......@@ -19,7 +19,6 @@
"Text Generation Inference"
],
"summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"description": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"operationId": "compat_generate",
"requestBody": {
"content": {
......@@ -108,7 +107,6 @@
"Text Generation Inference"
],
"summary": "Generate tokens",
"description": "Generate tokens",
"operationId": "generate",
"requestBody": {
"content": {
......@@ -192,7 +190,6 @@
"Text Generation Inference"
],
"summary": "Generate a stream of token using Server-Sent Events",
"description": "Generate a stream of token using Server-Sent Events",
"operationId": "generate_stream",
"requestBody": {
"content": {
......@@ -276,7 +273,6 @@
"Text Generation Inference"
],
"summary": "Health check method",
"description": "Health check method",
"operationId": "health",
"responses": {
"200": {
......@@ -305,7 +301,6 @@
"Text Generation Inference"
],
"summary": "Text Generation Inference endpoint info",
"description": "Text Generation Inference endpoint info",
"operationId": "get_model_info",
"responses": {
"200": {
......@@ -327,7 +322,6 @@
"Text Generation Inference"
],
"summary": "Prometheus metrics scrape endpoint",
"description": "Prometheus metrics scrape endpoint",
"operationId": "metrics",
"responses": {
"200": {
......@@ -349,7 +343,6 @@
"Text Generation Inference"
],
"summary": "Tokenize inputs",
"description": "Tokenize inputs",
"operationId": "tokenize",
"requestBody": {
"content": {
......@@ -394,7 +387,6 @@
"Text Generation Inference"
],
"summary": "Generate tokens",
"description": "Generate tokens",
"operationId": "chat_completions",
"requestBody": {
"content": {
......@@ -483,7 +475,6 @@
"Text Generation Inference"
],
"summary": "Generate tokens",
"description": "Generate tokens",
"operationId": "completions",
"requestBody": {
"content": {
......@@ -626,7 +617,6 @@
"type": "object",
"required": [
"id",
"object",
"created",
"model",
"system_fingerprint",
......@@ -653,9 +643,6 @@
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"object": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
},
......@@ -697,7 +684,6 @@
"type": "object",
"required": [
"id",
"object",
"created",
"model",
"system_fingerprint",
......@@ -723,9 +709,6 @@
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"object": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
}
......@@ -756,34 +739,19 @@
"nullable": true
},
"message": {
"$ref": "#/components/schemas/Message"
"$ref": "#/components/schemas/OutputMessage"
}
}
},
"ChatCompletionDelta": {
"type": "object",
"required": [
"role"
],
"properties": {
"content": {
"type": "string",
"example": "What is Deep Learning?",
"nullable": true
},
"role": {
"type": "string",
"example": "user"
"oneOf": [
{
"$ref": "#/components/schemas/TextMessage"
},
"tool_calls": {
"allOf": [
{
"$ref": "#/components/schemas/DeltaToolCall"
}
],
"nullable": true
}
"$ref": "#/components/schemas/ToolCallDelta"
}
]
},
"ChatCompletionLogprob": {
"type": "object",
......@@ -903,6 +871,15 @@
"example": 0.1,
"nullable": true
},
"response_format": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"default": "null",
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
......@@ -969,6 +946,38 @@
}
}
},
"Chunk": {
"type": "object",
"required": [
"id",
"created",
"choices",
"model",
"system_fingerprint"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/CompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
}
}
},
"CompatGenerateRequest": {
"type": "object",
"required": [
......@@ -988,6 +997,55 @@
}
}
},
"Completion": {
"oneOf": [
{
"allOf": [
{
"$ref": "#/components/schemas/Chunk"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
},
{
"allOf": [
{
"$ref": "#/components/schemas/CompletionFinal"
},
{
"type": "object",
"required": [
"object"
],
"properties": {
"object": {
"type": "string",
"enum": [
"text_completion"
]
}
}
}
]
}
],
"discriminator": {
"propertyName": "object"
}
},
"CompletionComplete": {
"type": "object",
"required": [
......@@ -1017,15 +1075,15 @@
}
}
},
"CompletionCompleteChunk": {
"CompletionFinal": {
"type": "object",
"required": [
"id",
"object",
"created",
"choices",
"model",
"system_fingerprint"
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
......@@ -1037,19 +1095,21 @@
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string"
},
"object": {
"type": "string"
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
......@@ -1081,12 +1141,7 @@
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"prompt": {
"type": "array",
"items": {
"type": "string"
},
"description": "The prompt to generate completions for.",
"example": "What is Deep Learning?"
"$ref": "#/components/schemas/Prompt"
},
"repetition_penalty": {
"type": "number",
......@@ -1100,6 +1155,15 @@
"nullable": true,
"minimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens.",
"example": "null",
"nullable": true
},
"stream": {
"type": "boolean"
},
......@@ -1263,8 +1327,16 @@
"GenerateParameters": {
"type": "object",
"properties": {
"adapter_id": {
"type": "string",
"description": "Lora adapter id",
"default": "null",
"example": "null",
"nullable": true
},
"best_of": {
"type": "integer",
"description": "Generate best_of sequences and return the one if the highest token logprobs.",
"default": "null",
"example": 1,
"nullable": true,
......@@ -1273,20 +1345,24 @@
},
"decoder_input_details": {
"type": "boolean",
"description": "Whether to return decoder input token logprobs and ids.",
"default": "false"
},
"details": {
"type": "boolean",
"description": "Whether to return generation details.",
"default": "true"
},
"do_sample": {
"type": "boolean",
"description": "Activate logits sampling.",
"default": "false",
"example": true
},
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"default": "null",
"example": 0.1,
"nullable": true,
......@@ -1304,6 +1380,7 @@
"max_new_tokens": {
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"example": "20",
"nullable": true,
......@@ -1312,6 +1389,7 @@
"repetition_penalty": {
"type": "number",
"format": "float",
"description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.",
"default": "null",
"example": 1.03,
"nullable": true,
......@@ -1319,6 +1397,7 @@
},
"return_full_text": {
"type": "boolean",
"description": "Whether to prepend the prompt to the generated text",
"default": "null",
"example": false,
"nullable": true
......@@ -1326,6 +1405,7 @@
"seed": {
"type": "integer",
"format": "int64",
"description": "Random sampling seed.",
"default": "null",
"example": "null",
"nullable": true,
......@@ -1337,6 +1417,7 @@
"items": {
"type": "string"
},
"description": "Stop generating tokens if a member of `stop` is generated.",
"example": [
"photographer"
],
......@@ -1345,6 +1426,7 @@
"temperature": {
"type": "number",
"format": "float",
"description": "The value used to module the logits distribution.",
"default": "null",
"example": 0.5,
"nullable": true,
......@@ -1353,6 +1435,7 @@
"top_k": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.",
"default": "null",
"example": 10,
"nullable": true,
......@@ -1361,6 +1444,7 @@
"top_n_tokens": {
"type": "integer",
"format": "int32",
"description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.",
"default": "null",
"example": 5,
"nullable": true,
......@@ -1370,6 +1454,7 @@
"top_p": {
"type": "number",
"format": "float",
"description": "Top-p value for nucleus sampling.",
"default": "null",
"example": 0.95,
"nullable": true,
......@@ -1378,6 +1463,7 @@
},
"truncate": {
"type": "integer",
"description": "Truncate inputs tokens to the given size.",
"default": "null",
"example": "null",
"nullable": true,
......@@ -1386,6 +1472,7 @@
"typical_p": {
"type": "number",
"format": "float",
"description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.",
"default": "null",
"example": 0.95,
"nullable": true,
......@@ -1394,6 +1481,7 @@
},
"watermark": {
"type": "boolean",
"description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).",
"default": "false",
"example": true
}
......@@ -1486,13 +1574,14 @@
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_length",
"max_input_tokens",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"max_client_batch_size",
"router",
"version"
],
"properties": {
......@@ -1529,7 +1618,7 @@
"example": "128",
"minimum": 0
},
"max_input_length": {
"max_input_tokens": {
"type": "integer",
"example": "1024",
"minimum": 0
......@@ -1572,6 +1661,11 @@
"example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
"nullable": true
},
"router": {
"type": "string",
"description": "Router Info",
"example": "text-generation-router"
},
"sha": {
"type": "string",
"example": "null",
......@@ -1584,7 +1678,6 @@
},
"version": {
"type": "string",
"description": "Router Info",
"example": "0.5.0"
},
"waiting_served_ratio": {
......@@ -1597,13 +1690,12 @@
"Message": {
"type": "object",
"required": [
"role"
"role",
"content"
],
"properties": {
"content": {
"type": "string",
"example": "My name is David and I",
"nullable": true
"$ref": "#/components/schemas/MessageContent"
},
"name": {
"type": "string",
......@@ -1613,13 +1705,6 @@
"role": {
"type": "string",
"example": "user"
},
"tool_calls": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolCall"
},
"nullable": true
}
}
},
......@@ -1649,6 +1734,12 @@
}
}
},
"Prompt": {
"type": "array",
"items": {
"type": "string"
}
},
"SimpleToken": {
"type": "object",
"required": [
......@@ -1808,9 +1899,7 @@
"$ref": "#/components/schemas/FunctionDefinition"
},
"id": {
"type": "integer",
"format": "int32",
"minimum": 0
"type": "string"
},
"type": {
"type": "string"
......@@ -1819,22 +1908,24 @@
},
"ToolType": {
"oneOf": [
{
"type": "object",
"default": null,
"nullable": true
},
{
"type": "string"
},
{
"type": "object",
"required": [
"FunctionName"
"function"
],
"properties": {
"FunctionName": {
"type": "string"
"function": {
"$ref": "#/components/schemas/FunctionName"
}
}
},
{
"type": "string",
"enum": [
"OneOf"
]
}
]
},
......
......@@ -3,12 +3,22 @@
title: Text Generation Inference
- local: quicktour
title: Quick Tour
- local: installation_nvidia
title: Using TGI with Nvidia GPUs
- local: installation_amd
title: Using TGI with AMD GPUs
- local: installation_gaudi
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation
title: Installation
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
......@@ -29,6 +39,10 @@
title: Using Guidance, JSON, tools
- local: basic_tutorials/visual_language_models
title: Visual Language Models
- local: basic_tutorials/monitoring
title: Monitoring TGI with Prometheus and Grafana
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: conceptual/streaming
......@@ -46,6 +60,9 @@
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)
title: Conceptual Guides
# Text Generation Inference Architecture
This document describes the architecture of Text Generation Inference (TGI) by outlining the call flow between its separate components.
A high-level architecture diagram can be seen here:
![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
This diagram shows that there are three separate components:
- **The router**, also named `webserver`, receives the client requests, buffers them, creates batches, and prepares gRPC calls to a model server.
- **The model server** is responsible for receiving the gRPC requests and running the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
- **The launcher** is a helper that launches one or several model servers (if the model is sharded), and it launches the router with the compatible arguments.
The router and the model server can run on two different machines; they do not need to be deployed together.
## The Router
This component is a Rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
The router receives the API calls and handles the batching logic (an introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
It uses different strategies to reduce latency between requests and responses, especially oriented towards decoding latency. It uses queues, schedulers, and block allocators to achieve that, producing batched requests that it then sends to the model server.
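For instance, a single call through the router's HTTP API looks like this (a minimal sketch, assuming a router listening on its default port 3000):
```python
import requests

# One-shot generation through the router's /generate endpoint.
resp = requests.post(
    "http://localhost:3000/generate",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}},
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```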
### Router's command line
Parameters are passed to the router on its command line (it does not rely on a configuration file):
```
Text Generation Webserver
Usage: text-generation-router [OPTIONS]
Options:
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
[env: MAX_CONCURRENT_REQUESTS=] [default: 128]
--max-best-of <MAX_BEST_OF>
[env: MAX_BEST_OF=] [default: 2]
--max-stop-sequences <MAX_STOP_SEQUENCES>
[env: MAX_STOP_SEQUENCES=] [default: 4]
--max-top-n-tokens <MAX_TOP_N_TOKENS>
[env: MAX_TOP_N_TOKENS=] [default: 5]
--max-input-tokens <MAX_INPUT_TOKENS>
[env: MAX_INPUT_TOKENS=] [default: 1024]
--max-total-tokens <MAX_TOTAL_TOKENS>
[env: MAX_TOTAL_TOKENS=] [default: 2048]
--waiting-served-ratio <WAITING_SERVED_RATIO>
[env: WAITING_SERVED_RATIO=] [default: 1.2]
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
[env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
--max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
[env: MAX_BATCH_TOTAL_TOKENS=]
--max-waiting-tokens <MAX_WAITING_TOKENS>
[env: MAX_WAITING_TOKENS=] [default: 20]
--max-batch-size <MAX_BATCH_SIZE>
[env: MAX_BATCH_SIZE=]
--hostname <HOSTNAME>
[env: HOSTNAME=] [default: 0.0.0.0]
-p, --port <PORT>
[env: PORT=] [default: 3000]
--master-shard-uds-path <MASTER_SHARD_UDS_PATH>
[env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
--tokenizer-name <TOKENIZER_NAME>
[env: TOKENIZER_NAME=] [default: bigscience/bloom]
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
[env: TOKENIZER_CONFIG_PATH=]
--revision <REVISION>
[env: REVISION=]
--validation-workers <VALIDATION_WORKERS>
[env: VALIDATION_WORKERS=] [default: 2]
--json-output
[env: JSON_OUTPUT=]
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
--cors-allow-origin <CORS_ALLOW_ORIGIN>
[env: CORS_ALLOW_ORIGIN=]
--ngrok
[env: NGROK=]
--ngrok-authtoken <NGROK_AUTHTOKEN>
[env: NGROK_AUTHTOKEN=]
--ngrok-edge <NGROK_EDGE>
[env: NGROK_EDGE=]
--messages-api-enabled
[env: MESSAGES_API_ENABLED=]
--disable-grammar-support
[env: DISABLE_GRAMMAR_SUPPORT=]
--max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
[env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
-h, --help
Print help
-V, --version
Print version
```
## The Model Server
The model server is a Python server that waits for gRPC requests, loads a given model, performs sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
The model server supports models instantiated using PyTorch and optimized for inference mainly on CUDA/ROCm.
### Model Server Variants
Several variants of the model server exist that are actively supported by Hugging Face:
- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
### Command Line Interface
The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:
- `download-weights` will download weights from the hub and, in some variants, convert them to a format adapted to the given implementation;
- `quantize` will quantize a model using the `gptq` package. This feature is not available or supported on all variants;
- `serve` will start the server that loads a model (or a model shard), receives gRPC calls from the router, performs inference, and provides a formatted response to the given request.
The command line parameters of `serve` on the TGI repository are:
```
Usage: cli.py serve [OPTIONS] MODEL_ID
╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
│ * model_id TEXT [default: None] [required] │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --revision TEXT [default: None] │
│ --sharded --no-sharded [default: no-sharded] │
│ --quantize [bitsandbytes|bitsandbytes [default: None] │
│ -nf4|bitsandbytes-fp4|gptq │
│ |awq|eetq|exl2|fp8] │
│ --speculate INTEGER [default: None] │
│ --dtype [float16|bfloat16] [default: None] │
│ --trust-remote-code --no-trust-remote-code [default: │
│ no-trust-remote-code] │
│ --uds-path PATH [default: │
│ /tmp/text-generation-serve… │
│ --logger-level TEXT [default: INFO] │
│ --json-output --no-json-output [default: no-json-output] │
│ --otlp-endpoint TEXT [default: None] │
│ --otlp-service-name TEXT [default: │
│ text-generation-inference...│
│ --help Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
Note that some variants might support different parameters, and they may accept more options that can be passed via environment variables.
## Call Flow
Once both components are initialized (weights downloaded, model server up and running), the router and the model server exchange data and info through gRPC calls. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:
- input chunks support, for text and image data,
- paged attention support
Here's a diagram that displays the exchanges that follow the router and model server startup.
```mermaid
sequenceDiagram
Router->>Model Server: service discovery
Model Server-->>Router: urls for other shards
Router->>Model Server: get model info
Model Server-->>Router: shard info
Router->>Model Server: health check
Model Server-->>Router: health OK
Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
Model Server-->>Router: warmup result
```
After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.
```mermaid
sequenceDiagram
participant Client 1
participant Client 2
participant Client 3
participant Router
participant Model Server
Client 1->>Router: generate_stream
Router->>Model Server: prefill(batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 1
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 2
Router->>Model Server: decode(cached_batch1)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 3
Client 2->>Router: generate_stream
Router->>Model Server: prefill(batch2)
Note right of Model Server: This stops previous batch, that is restarted
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 1'
Router->>Model Server: decode(cached_batch1, cached_batch2)
Model Server-->>Router: generations, cached_batch1, timings
Router-->>Client 1: token 4
Router-->>Client 2: token 2'
Note left of Client 1: Client 1 leaves
Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
Model Server-->>Router: filtered batch
Router->>Model Server: decode(cached_batch2)
Model Server-->>Router: generations, cached_batch2, timings
Router-->>Client 2: token 3'
Client 3->>Router: generate_stream
Note right of Model Server: This stops previous batch, that is restarted
Router->>Model Server: prefill(batch3)
Note left of Client 1: Client 3 leaves without receiving any batch
Router->>Model Server: clear_cache(batch3)
Note right of Model Server: This stops previous batch, that is restarted
Router->>Model Server: decode(cached_batch3)
Note right of Model Server: Last token (stopping criteria)
Model Server-->>Router: generations, cached_batch3, timings
Router-->>Client 2: token 4'
```
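As a complement to the diagram above, here is a minimal client-side sketch of such a streaming exchange (assuming a router listening on its default port 3000):
```python
import json
import requests

# Consume the Server-Sent Events emitted by /generate_stream, token by token.
with requests.post(
    "http://localhost:3000/generate_stream",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data:"):
            event = json.loads(line[len(b"data:"):])
            print(event["token"]["text"], end="", flush=True)
```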
......@@ -2,13 +2,13 @@
If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from the [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens).
If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
```
export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
export HF_TOKEN=<YOUR READ TOKEN>
```
If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
```bash
model=meta-llama/Llama-2-7b-chat-hf
......@@ -17,8 +17,8 @@ token=<your READ token>
docker run --gpus all \
--shm-size 1g \
-e HUGGING_FACE_HUB_TOKEN=$token \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
--model-id $model
```
......@@ -62,7 +62,9 @@ Options:
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
- marlin: 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
......@@ -124,7 +126,7 @@ Options:
## MAX_TOP_N_TOKENS
```shell
--max-top-n-tokens <MAX_TOP_N_TOKENS>
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens is used to return information about the the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like classification or ranking
[env: MAX_TOP_N_TOKENS=]
[default: 5]
......@@ -334,6 +336,13 @@ Options:
--otlp-endpoint <OTLP_ENDPOINT>
[env: OTLP_ENDPOINT=]
```
## OTLP_SERVICE_NAME
```shell
--otlp-service-name <OTLP_SERVICE_NAME>
[env: OTLP_SERVICE_NAME=]
[default: text-generation-inference.router]
```
## CORS_ALLOW_ORIGIN
```shell
......@@ -407,6 +416,14 @@ Options:
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 4]
```
## LORA_ADAPTERS
```shell
--lora-adapters <LORA_ADAPTERS>
Lora Adapters: a comma-separated list of adapter ids, e.g. `repo/adapter1,repo/adapter2`, to load during startup that will be available to callers via the `adapter_id` field in a request
[env: LORA_ADAPTERS=]
```
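For example, once adapters are loaded at startup, a single request can select one of them through the `adapter_id` generation parameter (a minimal sketch; `repo/adapter1` is a placeholder adapter id and the router is assumed to listen on its default port 3000):
```python
import requests

# Route this request through a LoRA adapter loaded via --lora-adapters.
resp = requests.post(
    "http://localhost:3000/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 20, "adapter_id": "repo/adapter1"},
    },
)
print(resp.json()["generated_text"])
```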
## HELP
```shell
......
# Monitoring TGI server with Prometheus and Grafana dashboard
A TGI server deployment can easily be monitored through a Grafana dashboard that consumes Prometheus metrics. Examples of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, the number of generated tokens, etc.
In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
## Setup on the server machine
First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
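As a quick check that the metrics are exposed, the scrape endpoint can be fetched directly (a minimal sketch, assuming TGI listens on port 80 as in this setup):
```python
import requests

# Print the raw Prometheus metrics exposed by TGI's /metrics endpoint.
print(requests.get("http://0.0.0.0:80/metrics").text)
```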
On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
```bash
wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
cd prometheus-2.52.0.linux-amd64
```
Prometheus needs to be configured to scrape TGI's metrics endpoint. To do so, edit the `scrape_configs` section of the Prometheus configuration file `prometheus.yml`, for example (the `job_name` value is just a label):
```
scrape_configs:
  - job_name: tgi
    static_configs:
      - targets: ["0.0.0.0:80"]
```
using the correct IP address and port for your deployment.
We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the correct IP and port are configured.
Once Prometheus is configured, the Prometheus server can be launched on the same machine where TGI is launched:
```bash
./prometheus --config.file="prometheus.yml"
```
In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (by default 9090) to the local computer. To do so, we can for example:
* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
* Use ngrok port tunneling
For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.
For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
```bash
ngrok http http://0.0.0.0:9090
```
As a sanity check, one can make sure from a local machine that the Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app).
## Setup on the monitoring machine
Monitoring is typically done on a different machine than the server. We use a Grafana dashboard to monitor TGI's server usage.
Two options are available:
* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
* Self-host a Grafana dashboard.
In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
```bash
wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
cd grafana-11.0.0
./bin/grafana-server
```
Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
Once logged in, a Prometheus data source needs to be configured for Grafana, using the `Add your first data source` option. There, add a Prometheus data source pointing at the Ngrok address we got earlier, which exposes the Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
Once the Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
Community-contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
Load your dashboard configuration, and your TGI dashboard should be ready to go!