doc(launcher): add more docs to the `launcher` itself and link in the README (#257)

b0b97fd9 · Nicolas Patry · GitHub · 593a5634 · b0b97fd9 · b0b97fd9
Unverified Commit b0b97fd9 authored Apr 29, 2023 by Nicolas Patry Committed by GitHub Apr 29, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 135 additions and 0 deletions

README.md README.md +5 -0

launcher/src/main.rs launcher/src/main.rs +130 -0

No files found.
--- a/README.md
+++ b/README.md
@@ -84,6 +84,11 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 ```
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+To see all options to serve your models, look in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the CLI:
+```
+text-generation-launcher --help
+```
 You can then query the model using either the `/generate` or `/generate_stream` routes:
 ```shell

--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -18,52 +18,182 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection};
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 struct Args {
+    /// The name of the model to load.
+    /// Can be a MODEL_ID as listed on <https://hf.co/models> like
+    /// `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`.
+    /// Or it can be a local directory containing the necessary files
+    /// as saved by `save_pretrained(...)` methods of transformers
    #[clap(default_value = "bigscience/bloom-560m", long, env)]
    model_id: String,
+    /// The actual revision of the model if you're referring to a model
+    /// on the hub. You can use a specific commit id or a branch like `refs/pr/2`.
    #[clap(long, env)]
    revision: Option<String>,
+    /// Whether or not to shard the model across multiple GPUs.
+    /// By default text-generation-inference will use all available GPUs to run
+    /// the model. Setting it to `false` deactivates `num_shard`.
    #[clap(long, env)]
    sharded: Option<bool>,
+    /// The number of shards to use if you don't want to use all GPUs on a given machine.
+    /// You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num-shard 2`
+    /// and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num-shard 2` to
+    /// launch 2 copies with 2 shards each on a given machine with 4 GPUs for instance.
    #[clap(long, env)]
    num_shard: Option<usize>,
+    /// Whether you want the model to be quantized or not. This will use bitsandbytes for
+    /// quantization on the fly.
    #[clap(long, env)]
    quantize: bool,
+    /// The maximum amount of concurrent requests for this particular deployment.
+    /// Having a low limit will refuse clients' requests instead of having them
+    /// wait for too long and is usually good to handle backpressure correctly.
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
+    /// This is the maximum allowed value for clients to set `best_of`.
+    /// Best of makes `n` generations at the same time, and returns the best
+    /// in terms of overall log probability over the entire generated sequence.
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
+    /// This is the maximum allowed value for clients to set `stop_sequences`.
+    /// Stop sequences are used to allow the model to stop on more than just
+    /// the EOS token, and enable more complex "prompting" where users can preprompt
+    /// the model in a specific way and define their "own" stop token aligned with
+    /// their prompt.
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
+    /// This is the maximum allowed input length (expressed in number of tokens)
+    /// for users. The larger this value, the longer prompt users can send which
+    /// can impact the overall memory required to handle the load.
+    /// Please note that some models have a finite range of sequence they can handle.
    #[clap(default_value = "1000", long, env)]
    max_input_length: usize,
+    /// This is the most important value to set as it defines the "memory budget"
+    /// of running clients requests.
+    /// Clients will send input sequences and ask to generate `max_new_tokens`
+    /// on top. With a value of `1512` users can send either a prompt of
+    /// `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for
+    /// `1511` max_new_tokens.
+    /// The larger this value, the more RAM each request will take up
+    /// and the less effective batching can be.
    #[clap(default_value = "1512", long, env)]
    max_total_tokens: usize,
+    /// The maximum allowed batch size during dynamic batching.
+    /// Using `max_batch_total_tokens` should be favored in general
+    /// as it's a finer way to control RAM usage.
    #[clap(long, env)]
    max_batch_size: Option<usize>,
+    /// This represents the ratio of waiting queries vs running queries where
+    /// you want to start considering pausing the running queries to include the waiting
+    /// ones into the same batch.
+    /// `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are
+    /// only 10 queries left in the current batch, we check if we can fit those 12
+    /// waiting queries into the batching strategy, and if yes, then batching happens
+    /// delaying the 10 running queries by a `prefill` run.
+    ///
+    /// This setting is only applied if there is room in the batch
+    /// as defined by `max_batch_total_tokens`.
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
+    /// **IMPORTANT** This is one critical control to allow maximum usage
+    /// of the available hardware.
+    ///
+    /// This represents the total amount of potential tokens within a batch.
+    /// When using padding (not recommended) this would be equivalent of
+    /// `batch_size` * `max_total_tokens`.
+    ///
+    /// However in the non-padded (flash attention) version this can be much finer.
+    ///
+    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
+    /// or a single query of `1000` tokens.
+    ///
+    /// So you don't have to control `max_batch_size` or `max_total_tokens`
+    /// that finely. In fact you could mostly relax them if you
+    /// want maximum flexibility. However, for your users if they are asking for the full amount of
+    /// total tokens, they are likely to wait for a very long time to get a spot
+    /// in the batch (since they are going to be alone) so setting `max_batch_size`
+    /// and `max_total_tokens` can still be useful to prevent those long waiting times.
+    ///
+    /// Overall this number should be the largest possible amount that fits the
+    /// remaining memory (after the model is loaded). Since the actual memory overhead
+    /// depends on other parameters like if you're using quantization, flash attention
+    /// or the model implementation, text-generation-inference cannot infer this number
+    /// automatically.
    #[clap(default_value = "32000", long, env)]
    max_batch_total_tokens: u32,
+    /// This setting defines how many tokens can be passed before forcing the waiting
+    /// queries to be put on the batch (if the size of the batch allows for it).
+    /// New queries require 1 `prefill` forward, which is different from `decode`
+    /// and therefore you need to pause the running batch in order to run `prefill`
+    /// to create the correct values for the waiting queries to be able to join the batch.
+    ///
+    /// With a value too small, queries will always "steal" the compute to run `prefill`
+    /// and running queries will be delayed by a lot.
+    ///
+    /// With a value too big, waiting queries could wait for a very long time
+    /// before being allowed a slot in the running batch. If your server is busy
+    /// that means that requests that could run in ~2s on an empty server could
+    /// end up running in ~20s because the query had to wait for 18s.
+    ///
+    /// This number is expressed in number of tokens to make it a bit more
+    /// "model" agnostic, but what should really matter is the overall latency
+    /// for end users.
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    #[clap(default_value = "3000", long, short, env)]
+    /// The port to listen on.
    port: u16,
+    /// The name of the socket for gRPC communication between the webserver
+    /// and the shards.
    #[clap(default_value = "/tmp/text-generation-server", long, env)]
    shard_uds_path: String,
+    /// The address the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "localhost", long, env)]
    master_addr: String,
+    /// The port the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "29500", long, env)]
    master_port: usize,
+    /// The location of the huggingface hub cache.
+    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    huggingface_hub_cache: Option<String>,
+    /// The location of the huggingface hub cache.
+    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    weights_cache_override: Option<String>,
+    /// For some models (like bloom), text-generation-inference implemented custom
+    /// cuda kernels to speed up inference. Those kernels were only tested on A100.
+    /// Use this flag to disable them if you're running on different hardware and
+    /// encounter issues.
    #[clap(long, env)]
    disable_custom_kernels: bool,
+    /// Outputs the logs in JSON format (useful for telemetry)
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(long, env)]
    cors_allow_origin: Vec<String>,
    #[clap(long, env)]