Commit efd602c8 authored by xuxzh1

last

parent f1b779fc
install-server:
cd server && make install
install-custom-kernels:
if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
install-server-cpu:
cd server && make install-server
install-router:
cd router && cargo install --path .
cd router && cargo install --path . --debug
install-launcher:
cd launcher && cargo install --path .
@@ -17,7 +13,10 @@ install-launcher:
install-benchmark:
cd benchmark && cargo install --path .
install: install-server install-router install-launcher install-custom-kernels
install: install-server install-router install-launcher
install-cpu: install-server-cpu install-router install-launcher
server-dev:
cd server && make run-dev
@@ -28,6 +27,10 @@ router-dev:
rust-tests: install-router install-launcher
cargo test
install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
integration-tests: install-integration-tests
pytest -s -vv -m "not private" integration-tests
@@ -34,19 +34,19 @@ Text Generation Inference (TGI) is a framework written in Rust and Python
Install the pytorch, triton, and flash-attn packages yourself on top of an existing Python environment:
**Install pytorch**
Install pytorch 2.1.0. The pytorch whl packages can be downloaded from [https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch); pick the pytorch 2.1.0 whl that matches your Python and DTK versions, then install:
```shell
```bash
pip install torch*  # the downloaded torch whl package
pip install setuptools wheel
```
**Install triton**
Download the triton whl from [https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton); pick the triton 2.1 whl that matches your Python and DTK versions.
```shell
```bash
pip install triton*  # the downloaded triton whl package
```
**Install flash-attn**
Download the flash_attn package from [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn); pick the flash_attn 2.0.4 whl that matches your Python and DTK versions.
```shell
```bash
pip install flash_attn*  # the downloaded flash_attn whl package
```
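To confirm the wheels installed correctly, a quick import check can be run; this is just a sanity-check sketch, and the exact version strings depend on the whl files you downloaded:
```bash
# sanity-check the three wheels installed above
python -c "import torch; print(torch.__version__)"            # expect 2.1.0*
python -c "import triton; print(triton.__version__)"          # expect 2.1*
python -c "import flash_attn; print(flash_attn.__version__)"  # expect 2.0.4
```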
@@ -66,36 +66,41 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
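To verify that protoc landed where the toolchain expects it, for example:
```bash
protoc --version  # prints e.g. "libprotoc 21.12", depending on the $PROTOC_ZIP installed above
```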
3. Install the TGI service
```
```bash
git clone http://developer.hpccube.com/codes/OpenDAS/text-generation-inference.git  # switch to the branch you need
cd text-generation-inference
# additionally install vllm and exllama
# install exllama
cd server
pip uninstall vllm  # optional: if the environment was prepared via method 1, uninstall the default vllm in it first
make install-vllm  # install the customized vllm
make install-exllama  # install the exllama kernels
make install-exllamav2  # install the exllamav2 kernels
cd ..  # return to the project root
source $HOME/.cargo/env
BUILD_EXTENSIONS=True make install  # install the text-generation service
```
4. Install the benchmark tool
```
```bash
cd text-generation-inference
make install-benchmark
```
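Once installed, the benchmark is run against a TGI server that is already serving a model. A minimal sketch (the tokenizer id below is a placeholder, substitute the model you are serving):
```bash
# assumes a TGI instance is already running locally
text-generation-benchmark --tokenizer-name meta-llama/Llama-2-7b-chat-hf
```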
Note: if the installation is too slow, you can speed it up by switching the default package index with the following command.
```
```bash
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
```
In addition, if `cargo install` is too slow, you can also speed it up by adding a mirror source in `~/.cargo/config`.
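A minimal sketch of such a `~/.cargo/config` (the USTC mirror is one common choice rather than a project requirement; the sparse protocol needs cargo >= 1.68):
```bash
mkdir -p ~/.cargo
cat >> ~/.cargo/config <<'EOF'
[source.crates-io]
replace-with = 'ustc'

[source.ustc]
registry = "sparse+https://mirrors.ustc.edu.cn/crates.io-index/"
EOF
```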
## Check the installed version
```
```bash
text-generation-launcher -V  # the version number tracks the official release
```
## Before use
Disable PyTorch TunableOp before starting the service:
```bash
export PYTORCH_TUNABLEOP_ENABLED=0
```
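A typical start-up then exports the variable before launching the service; a minimal sketch in which the model path and port are placeholders:
```bash
export PYTORCH_TUNABLEOP_ENABLED=0
# substitute your own model path (or model id) and port
text-generation-launcher --model-id /path/to/your/model --port 8080
```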
## Known Issues
-
## References
@@ -497,7 +497,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
"Lowest: {:.2} {unit}",
data.iter()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN)
.unwrap_or(&f64::NAN)
),
Style::default().fg(Color::Reset),
)]),
@@ -506,7 +506,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
"Highest: {:.2} {unit}",
data.iter()
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN)
.unwrap_or(&f64::NAN)
),
Style::default().fg(Color::Reset),
)]),
@@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
let min_latency: f64 = *latency_iter
.clone()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max_latency: f64 = *latency_iter
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let min_throughput: f64 = *throughput_iter
.clone()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max_throughput: f64 = *throughput_iter
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
// Chart min max values
let min_x = if zoom {
@@ -11,7 +11,7 @@ pub(crate) enum Event {
/// Key press.
Key(event::KeyEvent),
/// Terminal resize.
Resize(u16, u16),
Resize,
}
pub(crate) async fn terminal_event_task(
@@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
if event::poll(Duration::from_secs(0)).expect("no events available") {
match event::read().expect("unable to read event") {
event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
event::Event::Resize(w, h) => {
event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
event::Event::Resize(_w, _h) => {
event_sender.send(Event::Resize).await.unwrap_or(())
}
_ => (),
}
use std::time::{Duration, Instant};
use text_generation_client::{
Batch, CachedBatch, ClientError, NextTokenChooserParameters, Request, ShardedClient,
use text_generation_client::v3::{
Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
StoppingCriteriaParameters,
};
use text_generation_client::{Chunk, ClientError, Input};
use tokenizers::{Tokenizer, TruncationDirection};
use tokio::sync::{broadcast, mpsc};
@@ -142,6 +143,9 @@ async fn prefill(
.map(|id| Request {
id: id.into(),
prefill_logprobs: false,
input_chunks: Some(Input {
chunks: vec![Chunk::Text(sequence.clone()).into()],
}),
inputs: sequence.clone(),
truncate: sequence_length,
parameters: Some(parameters.clone()),
@@ -151,6 +155,9 @@ async fn prefill(
ignore_eos_token: true, // Will not stop even if an eos token is generated
}),
top_n_tokens: top_n_tokens.unwrap_or(0),
blocks: vec![],
slots: vec![],
adapter_id: None,
})
.collect();
@@ -159,6 +166,7 @@ async fn prefill(
requests,
size: batch_size,
max_tokens: batch_size * (sequence_length + decode_length),
max_blocks: 0,
};
// Run prefill
@@ -8,7 +8,7 @@ use crate::app::App;
use crate::event::Event;
use crossterm::ExecutableCommand;
use std::io;
use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
use tokenizers::Tokenizer;
use tokio::sync::{broadcast, mpsc};
use tui::backend::CrosstermBackend;
@@ -4,7 +4,7 @@
/// and: https://github.com/orhun/rust-tui-template
use clap::Parser;
use std::path::Path;
use text_generation_client::ShardedClient;
use text_generation_client::v3::ShardedClient;
use tokenizers::{FromPretrainedParameters, Tokenizer};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
@@ -147,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing::info!("Downloading tokenizer");
// Parse Huggingface hub token
let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
let auth_token = std::env::var("HF_TOKEN")
.or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
.ok();
// Download and instantiate tokenizer
// We need to download it outside of the Tokio runtime
@@ -156,17 +156,17 @@ fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
let min = data
.iter()
.min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
let max = data
.iter()
.max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN);
.unwrap_or(&f64::NAN);
(average, *min, *max)
}
fn px(data: &[f64], p: u32) -> f64 {
let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
*data.get(i).unwrap_or(&std::f64::NAN)
*data.get(i).unwrap_or(&f64::NAN)
}
fn format_value(value: f64, unit: &'static str) -> String {
@@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
.iter()
.map(|&p| {
let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
(format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
(format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
})
.collect()
}
@@ -12,7 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.6.0"
__version__ = "0.7.0"
DEPRECATION_WARNING = (
"`text_generation` clients are deprecated and will be removed in the near future. "
"Please use the `InferenceClient` from the `huggingface_hub` package instead."
)
from text_generation.client import Client, AsyncClient
from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
from enum import Enum
from pydantic import BaseModel, field_validator
from pydantic import BaseModel, field_validator, ConfigDict
from typing import Optional, List, Union, Any
from text_generation.errors import ValidationError
@@ -46,30 +46,6 @@ class Tool(BaseModel):
function: dict
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class CompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Function(BaseModel):
name: Optional[str]
arguments: str
@@ -95,24 +71,41 @@ class Choice(BaseModel):
finish_reason: Optional[str] = None
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
class CompletionRequest(BaseModel):
# Model identifier
model: str
system_fingerprint: str
choices: List[Choice]
# Prompt
prompt: str
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 1.0 means no penalty
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class CompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
text: str
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
class Completion(BaseModel):
@@ -163,6 +156,41 @@ class ChatRequest(BaseModel):
tool_prompt: Optional[str] = None
# Choice of tool to be used
tool_choice: Optional[str] = None
# Stop generating tokens if a member of `stop` is generated
stop: Optional[List[str]] = None
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[Choice]
class Parameters(BaseModel):
@@ -424,5 +452,9 @@ class StreamResponse(BaseModel):
# Inference API currently deployed model
class DeployedModel(BaseModel):
# Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
# with model_ prefixes, since this disables guardrails for colliding fields:
# https://github.com/pydantic/pydantic/issues/9177
model_config = ConfigDict(protected_namespaces=())
model_id: str
sha: str
Documentation available at: https://huggingface.co/docs/text-generation-inference
## Release
When making a release, please update the latest version in the documentation with:
```
export OLD_VERSION="2\.0\.3"
export NEW_VERSION="2\.0\.4"
find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
```
@@ -3,12 +3,22 @@
title: Text Generation Inference
- local: quicktour
title: Quick Tour
- local: installation_nvidia
title: Using TGI with Nvidia GPUs
- local: installation_amd
title: Using TGI with AMD GPUs
- local: installation_gaudi
title: Using TGI with Intel Gaudi
- local: installation_inferentia
title: Using TGI with AWS Inferentia
- local: installation
title: Installation
title: Installation from source
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
- local: architecture
title: Internal Architecture
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
@@ -20,7 +30,7 @@
- local: basic_tutorials/using_cli
title: Using TGI CLI
- local: basic_tutorials/launcher
title: All TGI CLI options
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
@@ -29,6 +39,10 @@
title: Using Guidance, JSON, tools
- local: basic_tutorials/visual_language_models
title: Visual Language Models
- local: basic_tutorials/monitoring
title: Monitoring TGI with Prometheus and Grafana
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: conceptual/streaming
@@ -46,6 +60,9 @@
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: How Guidance Works (via outlines)
title: How Guidance Works (via outlines)
- local: conceptual/lora
title: LoRA (Low-Rank Adaptation)
title: Conceptual Guides
@@ -2,13 +2,13 @@
If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
```
export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
export HF_TOKEN=<YOUR READ TOKEN>
```
If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
```bash
model=meta-llama/Llama-2-7b-chat-hf
@@ -17,8 +17,8 @@ token=<your READ token>
docker run --gpus all \
--shm-size 1g \
-e HUGGING_FACE_HUB_TOKEN=$token \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
--model-id $model
```