merge v0.4.1

99b471c2 · zhuwenwen · 1925d2e9 · 468d761b · 99b471c2 · 99b471c2
Commit 99b471c2 authored May 21, 2024 by zhuwenwen
20 changed files
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -5,116 +5,19 @@ Engine Arguments
 Below, you can find an explanation of every engine argument for vLLM:
-.. option:: --model <model_name_or_path>
+.. argparse::
+    :module: vllm.engine.arg_utils
-    Name or path of the huggingface model to use.
+    :func: _engine_args_parser
+    :prog: -m vllm.entrypoints.openai.api_server
-.. option:: --tokenizer <tokenizer_name_or_path>
+    :nodefaultconst:
-    Name or path of the huggingface tokenizer to use.
+Async Engine Arguments
+----------------------
-.. option:: --revision <revision>
+Below are the additional arguments related to the asynchronous engine:
-    The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
+.. argparse::
-.. option:: --tokenizer-revision <revision>
+    :module: vllm.engine.arg_utils
+    :func: _async_engine_args_parser
-    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
+    :prog: -m vllm.entrypoints.openai.api_server
+    :nodefaultconst:
-.. option:: --tokenizer-mode {auto,slow}
\ No newline at end of file
-    The tokenizer mode.
-    * "auto" will use the fast tokenizer if available.
-    * "slow" will always use the slow tokenizer.
-.. option:: --trust-remote-code
-    Trust remote code from huggingface.
-.. option:: --download-dir <directory>
-    Directory to download and load the weights, default to the default cache dir of huggingface.
-.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
-    The format of the model weights to load.
-    * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
-    * "pt" will load the weights in the pytorch bin format.
-    * "safetensors" will load the weights in the safetensors format.
-    * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
-    * "dummy" will initialize the weights with random values, mainly for profiling.
-.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
-    Data type for model weights and activations.
-    * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
-    * "half" for FP16. Recommended for AWQ quantization.
-    * "float16" is the same as "half".
-    * "bfloat16" for a balance between precision and range.
-    * "float" is shorthand for FP32 precision.
-    * "float32" for FP32 precision.
-.. option:: --max-model-len <length>
-    Model context length. If unspecified, will be automatically derived from the model config.
-.. option:: --worker-use-ray
-    Use Ray for distributed serving, will be automatically set when using more than 1 GPU.
-.. option:: --pipeline-parallel-size (-pp) <size>
-    Number of pipeline stages.
-.. option:: --tensor-parallel-size (-tp) <size>
-    Number of tensor parallel replicas.
-.. option:: --max-parallel-loading-workers <workers>
-    Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
-.. option:: --block-size {8,16,32}
-    Token block size for contiguous chunks of tokens.
-.. option:: --enable-prefix-caching
-    Enables automatic prefix caching
-.. option:: --seed <seed>
-    Random seed for operations.
-.. option:: --swap-space <size>
-    CPU swap space size (GiB) per GPU.
-.. option:: --gpu-memory-utilization <fraction>
-    The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. 
-    For example, a value of 0.5 would imply 50% GPU memory utilization.
-    If unspecified, will use the default value of 0.9.
-.. option:: --max-num-batched-tokens <tokens>
-    Maximum number of batched tokens per iteration.
-.. option:: --max-num-seqs <sequences>
-    Maximum number of sequences per iteration.
-.. option:: --max-paddings <paddings>
-    Maximum number of paddings in a batch.
-.. option:: --disable-log-stats
-    Disable logging statistics.
-.. option:: --quantization (-q) {awq,squeezellm,None}
-    Method used to quantize the weights.
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -30,23 +30,23 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`CohereForCausalLM`
    - Command-R
    - :code:`CohereForAI/c4ai-command-r-v01`, etc.
-    - 
+    -
  * - :code:`DbrxForCausalLM`
    - DBRX
    - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
-    - 
+    -
  * - :code:`DeciLMForCausalLM`
    - DeciLM
    - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
-    - 
+    -
  * - :code:`BloomForCausalLM`
    - BLOOM, BLOOMZ, BLOOMChat
    - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
-    - 
+    -
  * - :code:`FalconForCausalLM`
    - Falcon
    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
-    - 
+    -
  * - :code:`GemmaForCausalLM`
    - Gemma
    - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
@@ -54,19 +54,19 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`GPT2LMHeadModel`
    - GPT-2
    - :code:`gpt2`, :code:`gpt2-xl`, etc.
-    - 
+    -
  * - :code:`GPTBigCodeForCausalLM`
    - StarCoder, SantaCoder, WizardCoder
    - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
-    - 
+    -
  * - :code:`GPTJForCausalLM`
    - GPT-J
    - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
-    - 
+    -
  * - :code:`GPTNeoXForCausalLM`
    - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
    - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
-    - 
+    -
  * - :code:`InternLMForCausalLM`
    - InternLM
    - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
@@ -80,41 +80,45 @@ Alongside each architecture, we include some popular models that use it.
    - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
    -
  * - :code:`LlamaForCausalLM`
-    - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
+    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
-    - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
    - ✅︎
+  * - :code:`MiniCPMForCausalLM`
+    - MiniCPM
+    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
+    -
  * - :code:`MistralForCausalLM`
    - Mistral, Mistral-Instruct
    - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
    - ✅︎
  * - :code:`MixtralForCausalLM`
    - Mixtral-8x7B, Mixtral-8x7B-Instruct
-    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.
+    - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc.
    - ✅︎
  * - :code:`MPTForCausalLM`
    - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
    - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
-    - 
+    -
  * - :code:`OLMoForCausalLM`
    - OLMo
    - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
-    - 
+    -
  * - :code:`OPTForCausalLM`
    - OPT, OPT-IML
    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
-    - 
+    -
  * - :code:`OrionForCausalLM`
    - Orion
    - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
-    - 
+    -
  * - :code:`PhiForCausalLM`
    - Phi
    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
-    - 
+    -
  * - :code:`QWenLMHeadModel`
    - Qwen
    - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-    - 
+    -
  * - :code:`Qwen2ForCausalLM`
    - Qwen2
    - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
@@ -122,11 +126,11 @@ Alongside each architecture, we include some popular models that use it.
  * - :code:`Qwen2MoeForCausalLM`
    - Qwen2MoE
    - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-    - 
+    -
  * - :code:`StableLmForCausalLM`
    - StableLM
    - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
-    - 
+    -
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
@@ -164,3 +168,29 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
        llm = LLM(model=..., revision=..., trust_remote_code=True)  # Name or path of your model
        output = llm.generate("Hello, my name is")
        print(output)
+Model Support Policy
+---------------------
+At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
+1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
+2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
+3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
+4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
+5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
+Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
+Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard.
+We have the following levels of testing for models:
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
+2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
+3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
+4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
--- a/docs/source/quantization/fp8_e4m3_kvcache.rst
+++ b/docs/source/quantization/fp8_e4m3_kvcache.rst
+.. _fp8_e4m3_kvcache:
+FP8 E4M3 KV Cache
+==================
+Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, 
+improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 
+(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of 
+the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of 
+FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside 
+each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling 
+factors of a finer granularity (e.g. per-channel).
+These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If 
+this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an 
+unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). 
+To install AMMO (AlgorithMic Model Optimization):
+.. code-block:: console
+        $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
+Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon 
+offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. 
+Thus, LLM inference is greatly accelerated with minimal accuracy loss.
+Here is an example of how to enable this feature:
+.. code-block:: python
+        # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to 
+        # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+        from vllm import LLM, SamplingParams
+        sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
+        llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
+                  kv_cache_dtype="fp8",
+                  quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+        prompt = "London is the capital of"
+        out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+        print(out)
+        # output w/ scaling factors:  England, the United Kingdom, and one of the world's leading financial,
+        # output w/o scaling factors:  England, located in the southeastern part of the country. It is known 
+Note: prefix caching currently does not work when the FP8 KV cache is enabled; the forward_prefix kernel still needs to handle differing KV and cache types.
--- a/docs/source/quantization/fp8_e5m2_kv_cache.rst
+++ b/docs/source/quantization/fp8_e5m2_kv_cache.rst
-.. _fp8_e5m2_kv_cache:
+.. _fp8_kv_cache:
 FP8 E5M2 KV Cache
 ==================
@@ -21,7 +21,7 @@ Here is an example of how to enable this feature:
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM.
-    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2")
+    llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
@@ -31,3 +31,6 @@ Here is an example of how to enable this feature:
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+Note: prefix caching currently does not work when the FP8 KV cache is enabled; the forward_prefix kernel still needs to handle differing KV and cache types.
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat
 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-hf --dtype float32 --api-key token-abc123
+python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
 ```
 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
@@ -16,9 +16,8 @@ client = OpenAI(
 )
 completion = client.chat.completions.create(
-  model="meta-llama/Llama-2-7b-hf",
+  model="mistralai/Mistral-7B-Instruct-v0.2",
  messages=[
-    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
 )
@@ -38,9 +37,8 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 ```python
 completion = client.chat.completions.create(
-  model="meta-llama/Llama-2-7b-hf",
+  model="mistralai/Mistral-7B-Instruct-v0.2",
  messages=[
-    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
  ],
  extra_body={
@@ -89,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
 a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
 specifies how roles, messages, and other chat-specific tokens are encoded in the input.
-An example chat template for `meta-llama/Llama-2-7b-chat-hf` can be found [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/09bd0f49e16738cdfaa6e615203e126038736eb0/tokenizer_config.json#L12)
+An example chat template for `mistralai/Mistral-7B-Instruct-v0.2` can be found [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format)
 Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
 you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat

--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
 .. _on_cloud:
-Running on clouds with SkyPilot
+Deploying and scaling up with SkyPilot
-===============================
+================================================
 .. raw:: html
@@ -9,51 +9,75 @@ Running on clouds with SkyPilot
        <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
    </p>
-vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
+vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
-To install SkyPilot and setup your cloud credentials, run:
+Prerequisites
+-------------
+- Go to the `HuggingFace model page <https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct>`__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Check that you have installed SkyPilot (`docs <https://skypilot.readthedocs.io/en/latest/getting-started/installation.html>`__).
+- Check that :code:`sky check` shows clouds or Kubernetes are enabled.
 .. code-block:: console
-    $ pip install skypilot
+    pip install skypilot-nightly
-    $ sky check
+    sky check
+Run on a single instance
+------------------------
 See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
 .. code-block:: yaml
    resources:
-        accelerators: A100
+        accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+        use_spot: True
+        disk_size: 512  # Ensure model checkpoints can fit.
+        disk_tier: best
+        ports: 8081  # Expose to internet traffic.
    envs:
-        MODEL_NAME: decapoda-research/llama-13b-hf
+        MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-        TOKENIZER: hf-internal-testing/llama-tokenizer
+        HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
    setup: |
-        conda create -n vllm python=3.9 -y
+        conda create -n vllm python=3.10 -y
        conda activate vllm
-        git clone https://github.com/vllm-project/vllm.git
-        cd vllm
+        pip install vllm==0.4.0.post1
-        pip install .
+        # Install Gradio for web UI.
-        pip install gradio
+        pip install gradio openai
+        pip install flash-attn==2.5.7
    run: |
        conda activate vllm
        echo 'Starting vllm api server...'
-        python -u -m vllm.entrypoints.api_server \
+        python -u -m vllm.entrypoints.openai.api_server \
-                        --model $MODEL_NAME \
+            --port 8081 \
-                        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+            --model $MODEL_NAME \
-                        --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
+            --trust-remote-code \
+            --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+            2>&1 | tee api_server.log &
        echo 'Waiting for vllm api server to start...'
        while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
        echo 'Starting gradio server...'
-        python vllm/examples/gradio_webserver.py
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://localhost:8081/v1 \
+            --stop-token-ids 128009,128001
-Start the serving the LLaMA-13B model on an A100 GPU:
+Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
 .. code-block:: console
-    $ sky launch serving.yaml
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
 Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
@@ -61,9 +85,226 @@ Check the output of the command. There will be a shareable gradio link (like the
    (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
-**Optional**: Serve the 65B model instead of the default 13B and use more GPU:
+**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
+.. code-block:: console
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+Scale up to multiple replicas
+-----------------------------
+SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a ``service`` section to the YAML file.
+.. code-block:: yaml
+    service:
+        replicas: 2
+        # An actual request for readiness probe.
+        readiness_probe:
+            path: /v1/chat/completions
+            post_data:
+                model: $MODEL_NAME
+                messages:
+                    - role: user
+                      content: Hello! What is your name?
+                max_tokens: 1
+.. raw:: html
+    <details>
+    <summary>Click to see the full recipe YAML</summary>
+.. code-block:: yaml
+    service:
+        replicas: 2
+        # An actual request for readiness probe.
+        readiness_probe:
+            path: /v1/chat/completions
+            post_data:
+                model: $MODEL_NAME
+                messages:
+                    - role: user
+                      content: Hello! What is your name?
+                max_tokens: 1
+    resources:
+        accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+        use_spot: True
+        disk_size: 512  # Ensure model checkpoints can fit.
+        disk_tier: best
+        ports: 8081  # Expose to internet traffic.
+    envs:
+        MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+        HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+    setup: |
+        conda create -n vllm python=3.10 -y
+        conda activate vllm
+        pip install vllm==0.4.0.post1
+        # Install Gradio for web UI.
+        pip install gradio openai
+        pip install flash-attn==2.5.7
+    run: |
+        conda activate vllm
+        echo 'Starting vllm api server...'
+        python -u -m vllm.entrypoints.openai.api_server \
+            --port 8081 \
+            --model $MODEL_NAME \
+            --trust-remote-code \
+            --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+            2>&1 | tee api_server.log &
+        echo 'Waiting for vllm api server to start...'
+        while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+        echo 'Starting gradio server...'
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://localhost:8081/v1 \
+            --stop-token-ids 128009,128001
+.. raw:: html
+    </details>
+Start serving the Llama-3 8B model on multiple replicas:
+.. code-block:: console
+    HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+Wait until the service is ready:
 .. code-block:: console
-    sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
+    watch -n10 sky serve status vllm
+.. raw:: html
+    <details>
+    <summary>Example outputs:</summary>
+.. code-block:: console
+    Services
+    NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
+    vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
+    Service Replicas
+    SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES          STATUS  REGION
+    vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
+    vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
+.. raw:: html
+    </details>
+After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
+.. code-block:: console
+    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+    curl -L http://$ENDPOINT/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            },
+            {
+                "role": "user",
+                "content": "Who are you?"
+            }
+            ],
+            "stop_token_ids": [128009,  128001]
+        }'
+To enable autoscaling, you could specify additional configs in the ``service`` section:
+.. code-block:: yaml
+    service:
+        replica_policy:
+            min_replicas: 0
+            max_replicas: 3
+            target_qps_per_replica: 2
+This will scale the service up when the QPS exceeds 2 for each replica.
+**Optional**: Connect a GUI to the endpoint
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
+.. raw:: html
+    <details>
+    <summary>Click to see the full GUI YAML</summary>
+.. code-block:: yaml
+    envs:
+        MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
+        ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. 
+    resources:
+        cpus: 2
+    setup: |
+        conda activate vllm
+        if [ $? -ne 0 ]; then
+            conda create -n vllm python=3.10 -y
+            conda activate vllm
+        fi
+        # Install Gradio for web UI.
+        pip install gradio openai
+    run: |
+        conda activate vllm
+        export PATH=$PATH:/sbin
+        WORKER_IP=$(hostname -I | cut -d' ' -f1)
+        CONTROLLER_PORT=21001
+        WORKER_PORT=21002
+        echo 'Starting gradio server...'
+        git clone https://github.com/vllm-project/vllm.git || true
+        python vllm/examples/gradio_openai_chatbot_webserver.py \
+            -m $MODEL_NAME \
+            --port 8811 \
+            --model-url http://$ENDPOINT/v1 \
+            --stop-token-ids 128009,128001 | tee ~/gradio.log
+.. raw:: html
+    </details>
+1. Start the chat web UI:
+.. code-block:: console
+    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+2. Then, we can access the GUI at the returned gradio link:
+.. code-block:: console
+    | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
--- a/examples/aqlm_example.py
+++ b/examples/aqlm_example.py
+import argparse
+from vllm import LLM, SamplingParams
+def main():
+    parser = argparse.ArgumentParser(description='AQLM examples')
+    parser.add_argument('--model',
+                        '-m',
+                        type=str,
+                        default=None,
+                        help='model path, as for HF')
+    parser.add_argument('--choice',
+                        '-c',
+                        type=int,
+                        default=0,
+                        help='known good models by index, [0-4]')
+    parser.add_argument('--tensor_parallel_size',
+                        '-t',
+                        type=int,
+                        default=1,
+                        help='tensor parallel size')
+    args = parser.parse_args()
+    models = [
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
+        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
+        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
+    ]
+    model = LLM(args.model if args.model is not None else models[args.choice],
+                tensor_parallel_size=args.tensor_parallel_size)
+    sampling_params = SamplingParams(max_tokens=100, temperature=0)
+    outputs = model.generate("Hello my name is",
+                             sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+if __name__ == '__main__':
+    main()
--- a/examples/fp8/README.md
+++ b/examples/fp8/README.md
+# FP8 KV Cache 
+This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms.
+## Prerequisites
+- Python 3.x
+- PyTorch
+- NumPy
+- Hugging Face Transformers
+- Hugging Face Hub
+- AMMO 
+Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps:
+1. Install all necessary prerequisites and dependencies. 
+2. Convert HF model into a quantized HF model. 
+3. Extract KV Cache Scaling Factors from quantized HF model.
+4. Load KV Cache Scaling Factors into VLLM.
+### 2. Convert HF model into a quantized HF model.
+Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md).
+`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
+The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`.
+### 3. Extract KV Cache Scaling Factors from quantized HF model.
+`extract_scales.py` (examples/fp8/extract_scales.py) can be used to extract the KV cache scaling factors from your quantized HF model; however, at the moment this tool exclusively supports Llama 2 models. It is also important to note the following:
+1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename.
+2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM.
+3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks.
+```python
+# prerequisites:
+# - Quantized HF LLaMa 2 model 
+python3 examples/fp8/extract_scales.py --help
+Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE]
+KV Scale Extraction Example
+Required arguments:
+--quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU).
+Optional arguments:
+--cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None)
+--load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto)
+--revision: Specify the model's revision number. (Default: None)
+--output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None)
+--output_name: Specify the output filename. (Default: kv_cache_scales.json)
+--tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None)
+```
+```python
+Example:
+python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
+```
+### 4. Load KV Cache Scaling Factors into VLLM.
+The `benchmarks/benchmark_throughput.py` script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The KV cache scaling factors generated in the previous step can now be passed to this benchmark so that they are used when the KV cache data type is FP8.
+```python
+# prerequisites:
+# -  LLaMa 2 kv_cache_scales.json file
+python3 benchmarks/benchmark_throughput.py --help 
+usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
+                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
+                               [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
+                               [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
+                               [--quantization-param-path KV_CACHE_quantization_param_path]
+Benchmark Throughput Example  
+optional arguments:
+  -h, --help  show this help message and exit
+  --backend {vllm,hf,mii}
+  --dataset DATASET  Path to the dataset.
+  --input-len INPUT_LEN  Input prompt length for each request
+  --output-len OUTPUT_LEN  Output length for each request. Overrides the output length from the dataset.
+  --model MODEL
+  --tokenizer TOKENIZER
+  --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
+  --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
+  --n N  Number of generated sequences per prompt.
+  --use-beam-search
+  --num-prompts NUM_PROMPTS  Number of prompts to process.
+  --seed SEED
+  --hf-max-batch-size HF_MAX_BATCH_SIZE   Maximum batch size for HF backend.
+  --trust-remote-code trust remote code from huggingface
+  --max-model-len MAX_MODEL_LEN  Maximum length of a sequence (including prompt and output). If None, will be derived from the model.
+  --dtype {auto,half,float16,bfloat16,float,float32}  data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
+  --enforce-eager  enforce eager execution
+  --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
+  --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
+```
+```
+Example:
+python3 benchmarks/benchmark_throughput.py --input-len <INPUT_LEN> --output-len <OUTPUT_LEN> -tp <TENSOR_PARALLEL_SIZE> --kv-cache-dtype fp8 --quantization-param-path <path/to/kv_cache_scales.json> --model <path-to-llama2>
+```
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
+import argparse
+import glob
+import json
+import os
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+import numpy as np
+import torch
+from safetensors.torch import safe_open
+from vllm.model_executor.layers.quantization.schema import QuantParamSchema
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
+# The main differences are that we add the NPZ format and simplify
+# its functionality drastically for our purposes (e.g. we assume that
+# the quantized model exists locally and there is no need to download it)
+def _prepare_hf_weights(
+    quantized_model_dir: str,
+    load_format: str = "auto",
+    fall_back_to_pt: bool = True,
+) -> Tuple[str, List[str], bool]:
+    if not os.path.isdir(quantized_model_dir):
+        raise FileNotFoundError(
+            f"The quantized model directory `{quantized_model_dir}` "
+            "does not exist.")
+    use_safetensors = False
+    # Some quantized models use .pt files for storing the weights.
+    if load_format == "auto":
+        allow_patterns = ["*.safetensors", "*.bin"]
+    elif load_format == "safetensors":
+        use_safetensors = True
+        allow_patterns = ["*.safetensors"]
+    elif load_format == "pt":
+        allow_patterns = ["*.pt"]
+    elif load_format == "npz":
+        allow_patterns = ["*.npz"]
+    else:
+        raise ValueError(f"Unknown load_format: {load_format}")
+    if fall_back_to_pt:
+        allow_patterns += ["*.pt"]
+    hf_weights_files: List[str] = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(
+            os.path.join(quantized_model_dir, pattern))
+        if len(hf_weights_files) > 0:
+            if pattern == "*.safetensors":
+                use_safetensors = True
+            break
+    if not use_safetensors:
+        # Exclude files that are not needed for inference.
+        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+        blacklist = [
+            "training_args.bin",
+            "optimizer.bin",
+            "optimizer.pt",
+            "scheduler.pt",
+            "scaler.pt",
+        ]
+        hf_weights_files = [
+            f for f in hf_weights_files
+            if not any(f.endswith(x) for x in blacklist)
+        ]
+    if len(hf_weights_files) == 0:
+        raise RuntimeError(
+            f"Cannot find any model weights with `{quantized_model_dir}`")
+    return hf_weights_files, use_safetensors
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
+def _hf_tensorfile_iterator(filename: str, load_format: str,
+                            use_safetensors: bool):
+    if load_format == "npz":
+        assert not use_safetensors
+        with np.load(filename) as data:
+            for name in data.files:
+                param = torch.from_numpy(data[name])
+                yield name, param
+    elif use_safetensors:
+        with safe_open(filename, framework="pt") as f:
+            for name in f.keys():  # NOQA: SIM118
+                param = f.get_tensor(name)
+                yield name, param
+    else:
+        state = torch.load(filename, map_location="cpu")
+        for name, param in state.items():
+            yield name, param
+        del state
+        torch.cuda.empty_cache()
+def _kv_scales_extractor(
+        hf_tensor_files: Iterable[str],
+        use_safetensors: bool,
+        rank_keyword: str = "rank",
+        expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
+    """
+    Given a list of files containing tensor data, attempt to extract KV cache
+    scales from these files. Intended as a helper function taking in the output
+    from _prepare_hf_weights.
+    Args:
+    rank_keyword        Matches the number immediately after this keyword in the
+                        tensor filename to determine the TP rank corresponding
+                        to said tensor file
+    expected_tp_size    If specified, the TP size of the tensor files is checked
+                        against this and an error is raised if they don't match.
+    Returns a dictionary mapping TP ranks to their relevant KV cache scales.
+    The per-rank scales are themselves represented as a dictionary of layer
+    indices to the respective per-layer scale.
+    """
+    for char in rank_keyword:
+        assert not char.isdecimal(
+        ), f"Rank keyword {rank_keyword} contains a numeric character!"
+    rank_scales_map = {}
+    for tensor_file in hf_tensor_files:
+        try:
+            rank_idx = tensor_file.find(rank_keyword)
+            if rank_idx != -1:
+                start_idx = rank_idx + len(rank_keyword)
+                stop_idx = start_idx
+                while stop_idx < len(
+                        tensor_file) and tensor_file[stop_idx].isdecimal():
+                    stop_idx += 1
+                if stop_idx == start_idx:
+                    raise RuntimeError("Did not find rank # in filename.")
+                rank = int(tensor_file[start_idx:stop_idx])
+            elif len(hf_tensor_files) == 1:
+                # Since there is only one tensor file, we can assume
+                # that it's intended for TP rank 0
+                rank = 0
+            else:
+                raise RuntimeError(
+                    f"Filename does not contain '{rank_keyword}'.")
+        except RuntimeError:
+            print("Unable to determine TP rank "
+                  f"corresponding to file '{tensor_file}'")
+            raise
+        if rank not in rank_scales_map:
+            layer_scales_map = {}
+            rank_scales_map[rank] = layer_scales_map
+        else:
+            raise RuntimeError(
+                f"Tensor file '{tensor_file}' shares TP rank {rank} "
+                "with another tensor file.")
+        module_delimiter = ":" if args.load_format == "npz" else "."
+        for name, param in _hf_tensorfile_iterator(tensor_file,
+                                                   args.load_format,
+                                                   use_safetensors):
+            if "kv_cache_scaling_factor" in name:
+                nums = [
+                    int(s) for s in name.split(module_delimiter)
+                    if s.isdecimal()
+                ]
+                assert len(
+                    nums) == 1, f"Could not determine layer idx for {name}"
+                layer_idx = nums[0]
+                assert layer_idx not in layer_scales_map, f"Duplicate scaling"\
+                    f" factor corresponding to layer {layer_idx}"
+                try:
+                    layer_scales_map[layer_idx] = param.item()
+                except RuntimeError:
+                    print(
+                        "This utility supports only per-tensor scalar scales "
+                        f"for now. The tensor\n {name} = {param} \nis an "
+                        "invalid scale factor.")
+                    raise
+    if all(
+            len(layer_scales_map) == 0
+            for layer_scales_map in rank_scales_map.values()):
+        # Note: this is true even if the rank_scales_map is empty
+        print("WARNING: No KV cache scale factors found. No output saved.")
+        return None
+    empirical_tp_world_size = max(rank_scales_map.keys()) + 1
+    if expected_tp_size is not None:
+        assert expected_tp_size == empirical_tp_world_size, \
+            f"User expected TP world size = {expected_tp_size} " \
+            "from model but tool is expecting TP world size = " \
+            f"{empirical_tp_world_size} from model instead."
+    for i in range(empirical_tp_world_size):
+        assert i in rank_scales_map, "Expected TP world size = "\
+            f"{empirical_tp_world_size} but did not find KV " \
+            f"cache scaling factors for TP rank {i}"
+    print(f"Found TP world size = {empirical_tp_world_size} "
+          "when extracting KV cache scales!")
+    return rank_scales_map
+def _metadata_extractor(quantized_model_dir: str,
+                        metadata_extract_fns: \
+                        Dict[str, Callable[[Dict[str, Any]], Any]]) \
+                        -> Dict[str, Any]:
+    """
+    Given a directory containing quantized model files, this function
+    aims to extract metadata from the JSON files within this directory.
+    Each JSON file is expected to represent a dictionary in JSON
+    format (referred to as a "JSON-dictionary"). Metadata extraction is
+    defined by a dictionary called metadata_extract_fns, where each
+    metadata field name is mapped to an extraction function.
+    These extraction functions are designed to take a JSON-dictionary
+    as their only argument  and return the corresponding metadata.
+    While extraction functions are permitted to raise  exceptions, they
+    should only raise a KeyError or ValueError if the metadata field
+    cannot  be extracted from the current JSON-dictionary, yet there's
+    a possibility of finding it in another JSON-dictionary.
+    The function returns a dictionary that maps metadata fields to
+    their extracted data. The keys of this dictionary correspond exactly
+    to those in metadata_extract_fns. If any fields fail to be extracted,
+    their corresponding values are set to None, and a warning is printed.
+    """
+    if not os.path.isdir(quantized_model_dir):
+        raise FileNotFoundError(
+            f"The quantized model directory `{quantized_model_dir}` "
+            "does not exist.")
+    metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))
+    result = {}
+    for file in metadata_files:
+        with open(file) as f:
+            try:
+                metadata = json.load(f)
+            except json.JSONDecodeError:
+                print(f"Could not parse `{file}` as a valid metadata file,"
+                      " skipping it.")
+                continue
+            if not isinstance(metadata, dict):
+                print(f"The file `{file}` does not correspond to a "
+                      "JSON-serialized dictionary, skipping it.")
+                continue
+            for metadata_name, extract_fn in metadata_extract_fns.items():
+                try:
+                    metadata_info = extract_fn(metadata)
+                    if metadata_name not in result:
+                        result[metadata_name] = metadata_info
+                    elif metadata_info != result[metadata_name]:
+                        raise RuntimeError(
+                            "Metadata mismatch! Originally found "
+                            f"{metadata_name} = {result[metadata_name]} but "
+                            f"now found {metadata_name} = {metadata_info} in "
+                            f"`{file}`")
+                except KeyError:
+                    # It is possible that a given file does not contain some
+                    # of our selected metadata as it could be located in some
+                    # other metadata file.
+                    # 'EFINAE': extract_fn failure is not an error.
+                    pass
+                except ValueError:
+                    # See above.
+                    pass
+    # Warn if we cannot find any of the requested metadata
+    for metadata_name in metadata_extract_fns:
+        if metadata_name not in result:
+            print("WARNING: Unable to find requested metadata field "
+                  f"`{metadata_name}`, setting it to None.")
+            result[metadata_name] = None
+    return result
+def main(args):
+    metadata_extract_fns = {
+        "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"],
+        "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]),
+        "model_dtype": lambda json_dict: json_dict["dtype"]
+    }
+    recovered_metadata = _metadata_extractor(args.quantized_model,
+                                             metadata_extract_fns)
+    if args.tp_size is not None:
+        metadata_tp_size = recovered_metadata["tp_size"]
+        if metadata_tp_size is not None:
+            assert args.tp_size == metadata_tp_size, \
+              f"User expected TP world size = {args.tp_size} " \
+              f"but found TP world size = {metadata_tp_size} from metadata!"
+    expected_tp_size = args.tp_size or recovered_metadata["tp_size"]
+    rank_keyword = "rank"
+    hf_tensor_files, use_safetensors = _prepare_hf_weights(
+        args.quantized_model, args.load_format)
+    rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors,
+                                           rank_keyword, expected_tp_size)
+    # Postprocess: formatting to the current schema. Consider pulling it
+    # out into a dedicated function should it ever become more complicated.
+    rank_scales_map = {
+        rank: {k: scale[k]
+               for k in sorted(scale.keys())}
+        for rank, scale in rank_scales_map.items()
+    }
+    # TODO: Expand this with activation and weights scaling factors when
+    # they are used in the future
+    schema = QuantParamSchema(
+        model_type=recovered_metadata["model_type"],
+        kv_cache={
+            "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else
+                      recovered_metadata["model_dtype"]),
+            "scaling_factor":
+            rank_scales_map
+        },
+    )
+    if args.output_dir is None:
+        output_file = os.path.join(args.quantized_model, args.output_name)
+    else:
+        if not os.path.isdir(args.output_dir):
+            os.makedirs(args.output_dir, exist_ok=True)
+        output_file = os.path.join(args.output_dir, args.output_name)
+    with open(output_file, 'w') as f:
+        f.write(schema.model_dump_json(indent=4))
+        print(f"Completed! KV cache scaling factors saved to {output_file}")
+# Script entry point: build the command-line interface and run main().
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="This simple utility extracts the "
+        "KV cache scaling factors from a quantized HF model "
+        "and saves them to a JSON file compatible with later "
+        "use by vLLM (pass this file to the appropriate "
+        "runtime typically using the argument "
+        "--quantization-param-path <filename>). This is only used "
+        "if the KV cache dtype is FP8 and on ROCm (AMD GPU).")
+    # The only mandatory option: where the quantized model lives.
+    parser.add_argument(
+        "--quantized_model",
+        help="Specify the directory containing a single quantized HF model. "
+        "It is expected that the quantization format is FP8_E4M3, for use "
+        "on ROCm (AMD GPU).",
+        required=True)
+    # Selects which tensor file patterns are searched for.
+    parser.add_argument(
+        "--load_format",
+        help="Optionally specify the format of the model's tensor files "
+        "containing the KV cache scaling factors.",
+        choices=["auto", "safetensors", "npz", "pt"],
+        default="auto")
+    # When left at None, main() writes the output into the model directory.
+    parser.add_argument(
+        "--output_dir",
+        help="Optionally specify the output directory. By default the "
+        "KV cache scaling factors will be saved in the model directory, "
+        "however you can override this behavior here.",
+        default=None)
+    parser.add_argument(
+        "--output_name",
+        help="Optionally specify the output filename.",
+        # TODO: Change this once additional scaling factors are enabled
+        default="kv_cache_scales.json")
+    # Optional consistency check against the TP size found in the model.
+    parser.add_argument(
+        "--tp_size",
+        help="Optionally specify the tensor-parallel (TP) size that the "
+        "quantized model should correspond to. If specified, during KV "
+        "cache scaling factor extraction the observed TP size will be "
+        "checked against this and an error will be raised if there is "
+        "a mismatch. If not specified, the quantized model's expected "
+        "TP size is instead inferred from the largest TP rank observed. "
+        "The expected TP size is cross-checked against the TP ranks "
+        "observed in the quantized model and an error is raised if any "
+        "discrepancies are found.",
+        default=None,
+        type=int)
+    args = parser.parse_args()
+    main(args)
--- a/examples/fp8/quantizer/README.md
+++ b/examples/fp8/quantizer/README.md
+### Quantizer Utilities
+`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
+`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
+### Prerequisite
+#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later
+`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 
+#### AMMO Download (code and docs)
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz`
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz`
+### Usage
+#### Run on H100 system for speed if FP8; number of GPUs depends on the model size
+#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache:
+`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1`
+Outputs: the model structure and the quantized model and parameters (with scaling factors) are written as JSON and Safetensors files (the npz file is generated for reference only).
+```
+# ll ./ll2_7b_fp8/
+total 19998244
+drwxr-xr-x 2 root root        4096 Feb  7 01:08 ./
+drwxrwxr-x 8 1060 1061        4096 Feb  7 01:08 ../
+-rw-r--r-- 1 root root      176411 Feb  7 01:08 llama_tp1.json
+-rw-r--r-- 1 root root 13477087480 Feb  7 01:09 llama_tp1_rank0.npz
+-rw-r--r-- 1 root root  7000893272 Feb  7 01:08 rank0.safetensors
+#
+```
--- a/examples/fp8/quantizer/quantize.py
+++ b/examples/fp8/quantizer/quantize.py
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Adapted from examples/quantization/hf_ptq.py
+"""
+import argparse
+import copy
+import json
+import random
+import time
+import ammo.torch.quantization as atq
+import numpy as np
+import torch
+from ammo.torch.export import export_model_config
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Seed applied to Python's and NumPy's random number generators in main().
+RAND_SEED = 1234
+# Default `model_max_length` handed to the tokenizer by get_tokenizer().
+MAX_SEQ_LEN = 2048
+# Quantization config in which every listed quantizer pattern is disabled.
+# QUANT_CFG_CHOICES uses it for the "int8_wo", "int4_wo" and "full_prec"
+# formats.
+EMPTY_CFG = {
+    "quant_cfg": {
+        "*weight_quantizer": {
+            "enable": False,
+        },
+        "*input_quantizer": {
+            "enable": False
+        },
+        "*lm_head*": {
+            "enable": False
+        },
+        "*output_layer*": {
+            "enable": False
+        },
+        "default": {
+            "enable": False
+        },
+    },
+    "algorithm": "max",
+}
+KV_CACHE_CFG = {
+    "*.query_key_value.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+    "*.Wqkv.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+    "*.W_pack.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+    "*.c_attn.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+    "*.k_proj.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+    "*.v_proj.output_quantizer": {
+        "num_bits": 8,
+        "axis": None,
+        "enable": True
+    },
+}
+# Maps each supported quantization format name (checked against
+# `args.qformat` in main()) to the AMMO quantization config to apply.
+# The weight-only and full-precision formats share the all-disabled
+# EMPTY_CFG.
+QUANT_CFG_CHOICES = {
+    "int8_sq": atq.INT8_SMOOTHQUANT_CFG,
+    "fp8": atq.FP8_DEFAULT_CFG,
+    "int4_awq": atq.INT4_AWQ_CFG,
+    "w4a8_awq": atq.W4A8_AWQ_BETA_CFG,
+    "int8_wo": EMPTY_CFG,
+    "int4_wo": EMPTY_CFG,
+    "full_prec": EMPTY_CFG,
+}
+# Maps a substring of the model's class name (matched case-insensitively by
+# get_model_type()) to the model type string; the first matching entry wins.
+MODEL_NAME_PATTERN_MAP = {
+    "GPT2": "gpt2",
+    "Xverse": "llama",
+    "Llama": "llama",
+    "Mistral": "llama",
+    "GPTJ": "gptj",
+    "FalconForCausalLM": "falcon",
+    "RWForCausalLM": "falcon",
+    "baichuan": "baichuan",
+    "MPT": "mpt",
+    "Bloom": "bloom",
+    "ChatGLM": "chatglm",
+    "QWen": "qwen",
+}
+def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None):
+    print(f"Initializing tokenizer from {ckpt_path}")
+    tokenizer = AutoTokenizer.from_pretrained(
+        ckpt_path,
+        model_max_length=max_seq_len,
+        padding_side="left",
+        trust_remote_code=True,
+    )
+    if model_type and model_type == "qwen":
+        # qwen use token id 151643 as pad and eos tokens
+        tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
+        tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
+    # can't set attribute 'pad_token' for "<unk>"
+    if tokenizer.pad_token != "<unk>":
+        tokenizer.pad_token = tokenizer.eos_token
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    assert (tokenizer.pad_token
+            is not None), f"Pad token for {model_type} cannot be set!"
+    return tokenizer
+def get_model(ckpt_path, dtype="fp16", device="cuda"):
+    print(f"Initializing model from {ckpt_path}")
+    if dtype == "bf16" or dtype == "bfloat16":
+        dtype = torch.bfloat16
+    elif dtype == "fp16" or dtype == "float16":
+        dtype = torch.float16
+    elif dtype == "fp32" or dtype == "float32":
+        dtype = torch.float32
+    else:
+        raise NotImplementedError(f"Unknown dtype {dtype}")
+    # model_kwargs = {"torch_dtype": dtype}
+    model_kwargs = {"torch_dtype": "auto"}
+    model = AutoModelForCausalLM.from_pretrained(ckpt_path,
+                                                 device_map="auto",
+                                                 **model_kwargs,
+                                                 trust_remote_code=True)
+    model.eval()
+    model_dtype = next(model.parameters()).dtype
+    if dtype != model_dtype:
+        print("[TensorRT-LLM][WARNING] The manually set model data type is "
+              f"{dtype}, but the data type of the HuggingFace model is "
+              f"{model_dtype}.")
+    return model
+def get_model_type(model):
+    for k, v in MODEL_NAME_PATTERN_MAP.items():
+        if k.lower() in type(model).__name__.lower():
+            return v
+    return None
+def get_calib_dataloader(data="cnn_dailymail",
+                         tokenizer=None,
+                         batch_size=1,
+                         calib_size=512,
+                         block_size=512,
+                         device=None):
+    print("Loading calibration dataset")
+    if data == "pileval":
+        dataset = load_dataset(
+            "json",
+            data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst",
+            split="train")
+        dataset = dataset["text"][:calib_size]
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        dataset = dataset["article"][:calib_size]
+    else:
+        raise NotImplementedError
+    batch_encoded = tokenizer.batch_encode_plus(dataset,
+                                                return_tensors="pt",
+                                                padding="max_length",
+                                                truncation=True,
+                                                max_length=block_size)
+    if device:
+        batch_encoded = batch_encoded.to(device)
+    batch_encoded = batch_encoded["input_ids"]
+    calib_dataloader = DataLoader(batch_encoded,
+                                  batch_size=batch_size,
+                                  shuffle=False)
+    return calib_dataloader
+def quantize_model(model, quant_cfg, calib_dataloader=None):
+    """Quantize ``model`` with ``atq.quantize`` and print the elapsed time.
+    Args:
+        model: The model to quantize; it is also called on each
+            calibration batch.
+        quant_cfg: Quantization configuration forwarded to ``atq.quantize``.
+        calib_dataloader: Optional iterable of calibration batches. When
+            ``None`` the calibration loop does nothing.
+    Returns:
+        The ``model`` object that was passed in.
+    """
+    def calibrate_loop():
+        """Adjusts weights and scaling factors based on selected algorithms."""
+        # The docstring must be the first statement; it previously sat after
+        # the early return and was therefore just a dead string expression.
+        if calib_dataloader is None:
+            return
+        for idx, data in enumerate(calib_dataloader):
+            print(f"Calibrating batch {idx}")
+            model(data)
+    print("Starting quantization...")
+    start_time = time.time()
+    atq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    elapsed = time.time() - start_time
+    print(f"Quantization done. Total time used: {elapsed:.2f} s.")
+    return model
+def main(args):
+    """Optionally quantize a HuggingFace model and export it.
+    Args:
+        args: Parsed command-line namespace. The attributes read here are
+            ``model_dir``, ``dtype``, ``device``, ``qformat``,
+            ``kv_cache_dtype``, ``calib_size``, ``batch_size``,
+            ``awq_block_size``, ``output_dir``, ``tp_size`` and ``pp_size``.
+            ``calib_size`` is clamped to 32 in place for AWQ formats.
+    Raises:
+        EnvironmentError: If CUDA is not available.
+        ValueError: If quantization is requested with a ``qformat`` that has
+            no entry in ``QUANT_CFG_CHOICES``.
+    """
+    if not torch.cuda.is_available():
+        raise EnvironmentError("GPU is required for inference.")
+    random.seed(RAND_SEED)
+    np.random.seed(RAND_SEED)
+    model = get_model(args.model_dir, args.dtype, args.device)
+    model_type = get_model_type(model)
+    tokenizer = get_tokenizer(args.model_dir, model_type=model_type)
+    if args.qformat in ["full_prec", "int8_wo", "int4_wo"
+                        ] and args.kv_cache_dtype is None:
+        print(f"No quantization applied, export {args.dtype} model")
+    else:
+        # Validate the format first so an unsupported value fails before the
+        # calibration dataset is downloaded and tokenized.
+        if args.qformat not in QUANT_CFG_CHOICES:
+            raise ValueError(
+                f"Unsupported quantization format: {args.qformat}")
+        # Always work on a private copy: the AWQ block-size and KV-cache
+        # tweaks below must not leak into the module-level config tables.
+        quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat])
+        if "awq" in args.qformat:
+            if args.calib_size > 32:
+                print("AWQ calibration could take longer with calib_size = "
+                      f"{args.calib_size}, Using calib_size=32 instead")
+                args.calib_size = 32
+            print("\nAWQ calibration could take longer than other calibration "
+                  "methods. Please increase the batch size to speed up the "
+                  "calibration process. Batch size can be set by adding the "
+                  "argument --batch_size <batch_size> to the command line.\n")
+            weight_quantizer = quant_cfg["quant_cfg"][
+                "*weight_quantizer"]  # type: ignore
+            # The entry may be a list of quantizer configs; the first one
+            # carries the block size that is overridden here.
+            if isinstance(weight_quantizer, list):
+                weight_quantizer = weight_quantizer[0]
+            weight_quantizer["block_sizes"][-1] = args.awq_block_size
+        calib_dataloader = get_calib_dataloader(
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            calib_size=args.calib_size,
+            device=args.device,
+        )
+        if args.kv_cache_dtype is not None:
+            # Copy before editing so KV_CACHE_CFG itself stays untouched.
+            kv_cache_cfg = copy.deepcopy(KV_CACHE_CFG)
+            if args.kv_cache_dtype == "fp8":
+                for value in kv_cache_cfg.values():
+                    value.update({"num_bits": (4, 3)})  # type: ignore
+            quant_cfg["quant_cfg"].update(kv_cache_cfg)  # type: ignore
+        print(quant_cfg)
+        model = quantize_model(model, quant_cfg, calib_dataloader)
+    with torch.inference_mode():
+        if model_type is None:
+            print(f"Unknown model type {type(model).__name__}. Continue "
+                  "exporting...")
+            model_type = f"unknown:{type(model).__name__}"
+        export_path = args.output_dir
+        start_time = time.time()
+        if args.qformat == "int4_awq" and model_type == "qwen":
+            torch.save(model.state_dict(), export_path)
+        else:
+            export_npz = (model_type not in [
+                'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan'
+            ])
+            # export safetensors
+            export_model_config(
+                model,
+                model_type,
+                getattr(torch, args.dtype),
+                export_dir=export_path,
+                inference_tensor_parallel=args.tp_size,
+                inference_pipeline_parallel=args.pp_size,
+                # export_tensorrt_llm_config=(not export_npz),
+                export_tensorrt_llm_config=False,
+                export_npz=export_npz)
+            # Workaround for wo quantization: rewrite the exported
+            # config.json so its quant_algo matches the requested format.
+            if args.qformat in ["int8_wo", "int4_wo", "full_prec"]:
+                with open(f"{export_path}/config.json", 'r') as f:
+                    tensorrt_llm_config = json.load(f)
+                if args.qformat == "int8_wo":
+                    tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16'
+                elif args.qformat == "int4_wo":
+                    tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16'
+                else:
+                    tensorrt_llm_config["quantization"]["quant_algo"] = None
+                with open(f"{export_path}/config.json", "w") as f:
+                    json.dump(tensorrt_llm_config, f, indent=4)
+        end_time = time.time()
+        print("Quantized model exported to {} \nTotal time used {:.2f} s.".
+              format(export_path, end_time - start_time))
+if __name__ == "__main__":
+    # Command-line entry point: declare the options in a table (same order
+    # as they appear in --help), parse them, and hand them to main().
+    parser = argparse.ArgumentParser(description=__doc__)
+    option_table = [
+        ("--model_dir", {
+            "help": "Specify where the HuggingFace model is",
+            "required": True,
+        }),
+        ("--device", {"default": "cuda"}),
+        ("--dtype", {"help": "Model data type.", "default": "float16"}),
+        ("--qformat", {
+            "help": "Quantization format.",
+            "default": "full_prec",
+            "choices": [
+                "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo",
+                "int4_wo", "full_prec"
+            ],
+        }),
+        ("--batch_size", {
+            "help": "Batch size for calibration.",
+            "type": int,
+            "default": 1,
+        }),
+        ("--calib_size", {
+            "help": "Number of samples for calibration.",
+            "type": int,
+            "default": 512,
+        }),
+        ("--output_dir", {"default": "exported_model"}),
+        ("--tp_size", {"type": int, "default": 1}),
+        ("--pp_size", {"type": int, "default": 1}),
+        ("--awq_block_size", {"type": int, "default": 128}),
+        ("--kv_cache_dtype", {
+            "help": "KV Cache dtype.",
+            "default": None,
+            "choices": ["int8", "fp8", None],
+        }),
+    ]
+    for flag, options in option_table:
+        parser.add_argument(flag, **options)
+    args = parser.parse_args()
+    main(args)
--- a/examples/openai_chatcompletion_client.py
+++ b/examples/openai_chatcompletion_client.py
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+                        TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
+from vllm.model_executor.models import ModelRegistry
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+tensorize_vllm_model.py is a script that can be used to serialize and 
+deserialize vLLM models. These models can be loaded using tensorizer 
+to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption is also supported, although 
+libsodium must be installed to use it. Install vllm with tensorizer support 
+using `pip install vllm[tensorizer]`.
+To serialize a model, install vLLM from source, then run something 
+like this from the root level of this repository:
+python -m examples.tensorize_vllm_model \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   serialize \
+   --serialized-directory s3://my-bucket/ \
+   --suffix vllm
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used. This
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT_URL`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and 
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this 
+script.
+You can also encrypt the model weights with a randomly-generated key by 
+providing a `--keyfile` argument.
+To deserialize a model, you can run something like this from the root 
+level of this repository:
+python -m examples.tensorize_vllm_model \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   deserialize \
+   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+Which downloads the model tensors from your S3 bucket and deserializes them.
+You can also provide a `--keyfile` argument to decrypt the model weights if 
+they were serialized with encryption.
+For more information on the available arguments for serializing, run 
+`python -m examples.tensorize_vllm_model serialize --help`.
+Or for deserializing:
+`python -m examples.tensorize_vllm_model deserialize --help`.
+Once a model is serialized, it can be used to load the model when running the
+OpenAI inference client at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument that is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`, to
+signify that the model to be deserialized is a vLLM model, rather than a 
+HuggingFace `PreTrainedModel`, which can also be deserialized using tensorizer
+in the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, the `--encryption-keyfile` argument can be used
+to provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that can be used to configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+    llm = LLM(model="facebook/opt-125m",
+              load_format="tensorizer",
+              tensorizer_uri=path_to_opt_tensors,
+              num_readers=3,
+              vllm_tensorized=True)
+"""
+def parse_args():
+    """Parse the command line for the serialize/deserialize example script.
+    The parser carries every option added by ``EngineArgs.add_cli_args``
+    plus a mandatory sub-command stored in ``command``:
+    ``serialize`` (``--suffix``, ``--serialized-directory``, ``--keyfile``)
+    or ``deserialize`` (``--path-to-tensors``, ``--keyfile``).
+    Returns:
+        The ``argparse.Namespace`` produced by ``parser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser(
+        description="An example script that can be used to serialize and "
+        "deserialize vLLM models. These models "
+        "can be loaded using tensorizer directly to the GPU "
+        "extremely quickly. Tensor encryption and decryption is "
+        "also supported, although libsodium must be installed to "
+        "use it.")
+    parser = EngineArgs.add_cli_args(parser)
+    # Make the sub-command mandatory so that omitting it yields a normal
+    # argparse usage error instead of a late failure when sub-command-only
+    # attributes (e.g. ``keyfile``) are read from the namespace.
+    subparsers = parser.add_subparsers(dest='command', required=True)
+    serialize_parser = subparsers.add_parser(
+        'serialize', help="Serialize a model to `--serialized-directory`")
+    serialize_parser.add_argument(
+        "--suffix",
+        type=str,
+        required=False,
+        help=(
+            "The suffix to append to the serialized model directory, which is "
+            "used to construct the location of the serialized model tensors, "
+            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+            "`--suffix` is `v1`, the serialized model tensors will be "
+            "saved to "
+            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+            "If none is provided, a random UUID will be used."))
+    serialize_parser.add_argument(
+        "--serialized-directory",
+        type=str,
+        required=True,
+        help="The directory to serialize the model to. "
+        "This can be a local directory or S3 URI. The path to where the "
+        "tensors are saved is a combination of the supplied `dir` and model "
+        "reference ID. For instance, if `dir` is the serialized directory, "
+        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
+        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
+        "where `suffix` is given by `--suffix` or a random UUID if not "
+        "provided.")
+    serialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Encrypt the model weights with a randomly-generated binary key,"
+              " and save the key at this path"))
+    deserialize_parser = subparsers.add_parser(
+        'deserialize',
+        help=("Deserialize a model from `--path-to-tensors`"
+              " to verify it can be loaded and used."))
+    deserialize_parser.add_argument(
+        "--path-to-tensors",
+        type=str,
+        required=True,
+        help="The local path or S3 URI to the model tensors to deserialize. ")
+    deserialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Path to a binary key to use to decrypt the model weights,"
+              " if the model was serialized with encryption"))
+    return parser.parse_args()
+def make_model_contiguous(model):
+    """Rebind each parameter's ``data`` to a contiguous tensor, in place.
+    Every parameter yielded by ``model.parameters()`` is processed; nothing
+    is returned. This ensures tensors are saved in memory contiguously.
+    """
+    for weight in model.parameters():
+        contiguous_data = weight.data.contiguous()
+        weight.data = contiguous_data
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    """Return the first vLLM model class registered for ``config``.
+    Each name in ``config.architectures`` is looked up through
+    ``ModelRegistry.load_model_cls``; the first non-``None`` result wins.
+    Raises:
+        ValueError: If no listed architecture resolves to a model class
+            (including when the config lists none).
+    """
+    # The attribute may exist but be None; treat that like an empty list so
+    # the descriptive ValueError below is raised instead of a TypeError from
+    # iterating over None.
+    architectures = getattr(config, "architectures", None) or []
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+def serialize():
+    """Load the model through vLLM and write its tensors to ``model_path``.
+    Reads the module-level ``args``, ``keyfile`` and ``model_path``. When
+    ``keyfile`` is truthy, a random encryption key is generated, written to
+    that path, and used to encrypt the serialized tensors.
+    """
+    # Rebuild a namespace holding only the EngineArgs fields from the CLI.
+    engine_field_values = {}
+    for field in dataclasses.fields(EngineArgs):
+        engine_field_values[field.name] = getattr(args, field.name)
+    engine_args = EngineArgs.from_cli_args(
+        argparse.Namespace(**engine_field_values))
+    engine = LLMEngine.from_engine_args(engine_args)
+    driver_worker = engine.model_executor.driver_worker
+    model = driver_worker.model_runner.model
+    encryption_params = None
+    if keyfile:
+        encryption_params = EncryptionParams.random()
+        with _write_stream(keyfile) as key_stream:
+            key_stream.write(encryption_params.key)
+    with _write_stream(model_path) as tensor_stream:
+        serializer = TensorSerializer(tensor_stream,
+                                      encryption=encryption_params)
+        serializer.write_module(model)
+        serializer.close()
+    print("Serialization complete. Model tensors saved to", model_path)
+    if keyfile:
+        print("Key saved to", keyfile)
+def deserialize():
+    """Load tensors from ``model_path`` into a freshly built vLLM model.
+    Reads the module-level ``model_ref``, ``keyfile``, ``model_path`` and
+    ``tensorizer_args``. When ``keyfile`` is truthy, its contents are used
+    as the decryption key. Prints the amount deserialized, the throughput
+    and the memory usage before/after.
+    Returns:
+        The model instance the tensors were loaded into.
+    """
+    hf_config = AutoConfig.from_pretrained(model_ref)
+    with no_init_or_tensor():
+        architecture_cls = _get_vllm_model_architecture(hf_config)
+        model = architecture_cls(hf_config)
+    mem_before = get_mem_usage()
+    started = time.time()
+    if keyfile:
+        with _read_stream(keyfile) as key_stream:
+            decryption_params = DecryptionParams.from_key(key_stream.read())
+            tensorizer_args.deserializer_params['encryption'] = \
+                decryption_params
+    with _read_stream(model_path) as tensor_stream, TensorDeserializer(
+            tensor_stream,
+            **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        finished = time.time()
+    # Brag about how fast we are.
+    size_text = convert_bytes(deserializer.total_tensor_bytes)
+    elapsed = finished - started
+    rate_text = convert_bytes(deserializer.total_tensor_bytes / elapsed)
+    mem_after = get_mem_usage()
+    print(f"Deserialized {size_text} in {elapsed:0.2f}s, {rate_text}/s")
+    print(f"Memory usage before: {mem_before}")
+    print(f"Memory usage after: {mem_after}")
+    return model
+# Script body: parse the CLI, configure S3 streams, set up a single-process
+# distributed group, then run the requested sub-command.
+args = parse_args()
+# Validate the sub-command up front. Previously this check came last, after
+# ``args.keyfile`` (only defined by the sub-parsers) had already been read,
+# so a missing sub-command surfaced as an AttributeError instead.
+if args.command not in ("serialize", "deserialize"):
+    raise ValueError("Either serialize or deserialize must be specified.")
+# CLI values take precedence over the corresponding environment variables.
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+                    or None)
+s3_secret_access_key = (args.s3_secret_access_key
+                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+# Stream openers for reading ("rb") and writing ("wb+") with the S3
+# credentials pre-bound.
+_read_stream, _write_stream = (partial(
+    stream_io.open_stream,
+    mode=mode,
+    s3_access_key_id=s3_access_key_id,
+    s3_secret_access_key=s3_secret_access_key,
+    s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+model_ref = args.model
+# "org/name" references keep yielding "name"; a reference without a slash
+# used to raise IndexError and now falls back to the reference itself.
+_model_ref_parts = model_ref.split("/")
+model_name = (_model_ref_parts[1]
+              if len(_model_ref_parts) > 1 else _model_ref_parts[0])
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+keyfile = args.keyfile if args.keyfile else None
+if args.command == "serialize":
+    input_dir = args.serialized_directory.rstrip('/')
+    suffix = args.suffix if args.suffix else uuid.uuid4().hex
+    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+    model_path = f"{base_path}/model.tensors"
+    serialize()
+else:
+    # Only "deserialize" can reach this branch (validated above).
+    tensorizer_args = TensorizerArgs.from_cli_args(args)
+    model_path = args.path_to_tensors
+    deserialize()
--- a/format.sh
+++ b/format.sh
@@ -93,9 +93,21 @@ fi
 echo 'vLLM yapf: Done'
 # Run mypy
-# TODO(zhuohan): Enable mypy
+echo 'vLLM mypy:'
-# echo 'vLLM mypy:'
+mypy vllm/attention --config-file pyproject.toml
-# mypy
+mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/entrypoints --config-file pyproject.toml
+mypy vllm/executor --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/transformers_utils --config-file pyproject.toml
+mypy vllm/engine  --config-file pyproject.toml
+mypy vllm/worker --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
+mypy vllm/model_executor/*.py  --config-file pyproject.toml
+# mypy vllm/lora/*.py --config-file pyproject.toml
 CODESPELL_EXCLUDES=(
    '--skip' '*docs/source/_build/**'
@@ -228,5 +240,3 @@ if ! git diff --quiet &>/dev/null; then
    exit 1
 fi
--- a/patch_xformers.rocm.sh
+++ b/patch_xformers.rocm.sh
-#!/bin/bash
-set -e
-XFORMERS_VERSION="0.0.23"
-export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
-if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
-    echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
-    exit 1
-fi
-export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
-export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
-echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
-echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
-    echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
-    echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
-else
-    echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
-fi
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
-    echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
-    echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
-else
-    echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
-fi
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
    "ninja",
    "packaging",
    "setuptools >= 49.4.0",
-    "torch == 2.1.2",
+    "torch == 2.2.1",
    "wheel",
 ]
 build-backend = "setuptools.build_meta"
@@ -13,6 +13,10 @@ build-backend = "setuptools.build_meta"
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
+exclude = [
+    # External file, leaving license intact
+    "examples/fp8/quantizer/quantize.py"
+]
 [tool.ruff.lint]
 select = [
@@ -42,11 +46,16 @@ ignore = [
 python_version = "3.8"
 ignore_missing_imports = true
+check_untyped_defs = true
+follow_imports = "skip"
 files = "vllm"
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
-exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
+exclude = [
+    "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+    # Ignore triton kernels in ops.
+    'vllm/attention/ops/.*\.py$'
+]
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"

--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
 ninja
 packaging
 setuptools>=49.4.0
-torch==2.1.2
+torch==2.2.1
 wheel
--- a/requirements.txt
+++ b/requirements.txt
-cmake>=3.21
+cmake >= 3.21
 ninja  # For faster builds.
 psutil
-ray >= 2.9
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
-torch == 2.1.2
 requests
-psutil
 py-cpuinfo
-transformers >= 4.39.1  # Required for StarCoder2 & Llava.
+transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
-xformers == 0.0.23.post1  # Required for CUDA 12.1.
+tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
-pynvml == 11.5.0
+tiktoken == 0.6.0  # Required for DBRX tokenizer
-triton >= 2.1.0
+lm-format-enforcer == 0.9.8
-outlines == 0.0.34
+outlines == 0.0.34 # Requires torch >= 2.1.0
-tiktoken == 0.6.0 # Required for DBRX tokenizer
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
+# Common dependencies
+-r requirements-common.txt
+# Dependencies for x86_64 CPUs
+torch == 2.2.1+cpu
+triton >= 2.2.0  # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
+# Common dependencies
+-r requirements-common.txt
+# Dependencies for NVIDIA GPUs
+ray >= 2.9
+nvidia-ml-py # for pynvml package
+vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
+torch == 2.2.1
+xformers == 0.0.25  # Requires PyTorch 2.2.1