update vllm0.5.0

d824b9d3 · zhuwenwen · 8d5187cb · d824b9d3 · d824b9d3 · d824b9d3
Commit d824b9d3 authored Sep 30, 2024 by zhuwenwen
20 changed files
--- a/examples/api_client.py
+++ b/examples/api_client.py
+"""Example Python client for vllm.entrypoints.api_server"""
+
+import argparse
+import json
+from typing import Iterable, List
+
+import requests
+
+
+def clear_line(n: int = 1) -> None:
+    LINE_UP = '\033[1A'
+    LINE_CLEAR = '\x1b[2K'
+    for _ in range(n):
+        print(LINE_UP, end=LINE_CLEAR, flush=True)
+
+
+def post_http_request(prompt: str,
+                      api_url: str,
+                      n: int = 1,
+                      stream: bool = False) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    pload = {
+        "prompt": prompt,
+        "n": n,
+        "use_beam_search": True,
+        "temperature": 0.0,
+        "max_tokens": 16,
+        "stream": stream,
+    }
+    response = requests.post(api_url, headers=headers, json=pload, stream=True)
+    return response
+
+
+def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
+    for chunk in response.iter_lines(chunk_size=8192,
+                                     decode_unicode=False,
+                                     delimiter=b"\0"):
+        if chunk:
+            data = json.loads(chunk.decode("utf-8"))
+            output = data["text"]
+            yield output
+
+
+def get_response(response: requests.Response) -> List[str]:
+    data = json.loads(response.content)
+    output = data["text"]
+    return output
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--n", type=int, default=4)
+    parser.add_argument("--prompt", type=str, default="San Francisco is a")
+    parser.add_argument("--stream", action="store_true")
+    args = parser.parse_args()
+    prompt = args.prompt
+    api_url = f"http://{args.host}:{args.port}/generate"
+    n = args.n
+    stream = args.stream
+
+    print(f"Prompt: {prompt!r}\n", flush=True)
+    response = post_http_request(prompt, api_url, n, stream)
+
+    if stream:
+        num_printed_lines = 0
+        for h in get_streaming_response(response):
+            clear_line(num_printed_lines)
+            num_printed_lines = 0
+            for i, line in enumerate(h):
+                num_printed_lines += 1
+                print(f"Beam candidate {i}: {line!r}", flush=True)
+    else:
+        output = get_response(response)
+        for i, line in enumerate(output):
+            print(f"Beam candidate {i}: {line!r}", flush=True)
--- a/examples/aqlm_example.py
+++ b/examples/aqlm_example.py
+import argparse
+
+from vllm import LLM, SamplingParams
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='AQLM examples')
+
+    parser.add_argument('--model',
+                        '-m',
+                        type=str,
+                        default=None,
+                        help='model path, as for HF')
+    parser.add_argument('--choice',
+                        '-c',
+                        type=int,
+                        default=0,
+                        help='known good models by index, [0-4]')
+    parser.add_argument('--tensor_parallel_size',
+                        '-t',
+                        type=int,
+                        default=1,
+                        help='tensor parallel size')
+
+    args = parser.parse_args()
+
+    models = [
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
+        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
+        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
+        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
+    ]
+
+    model = LLM(args.model if args.model is not None else models[args.choice],
+                tensor_parallel_size=args.tensor_parallel_size)
+
+    sampling_params = SamplingParams(max_tokens=100, temperature=0)
+    outputs = model.generate("Hello my name is",
+                             sampling_params=sampling_params)
+    print(outputs[0].outputs[0].text)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/fp8/README.md
+++ b/examples/fp8/README.md
+# FP8 KV Cache 
+
+This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms.
+
+## Prerequisites
+
+- Python 3.x
+- PyTorch
+- NumPy
+- Hugging Face Transformers
+- Hugging Face Hub
+- AMMO 
+
+Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps:
+1. Install all necessary prerequisites and dependencies. 
+2. Convert HF model into a quantized HF model. 
+3. Extract KV Cache Scaling Factors from quantized HF model.
+4. Load KV Cache Scaling Factors into VLLM.
+
+### 2. Convert HF model into a quantized HF model.
+Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md).
+
+`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit  (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format).
+
+The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`.
+
+### 3. Extract KV Cache Scaling Factors from quantized HF model.
+`extract_scales.py` (examples/fp8/extract_scales.py) can be used to extract the KV cache scaling factors from your quantized HF model. At the moment, this tool exclusively supports Llama 2 models. It is also important to note the following:
+1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename.
+
+2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM.
+
+3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks.
+
+```python
+# prerequisites:
+# - Quantized HF LLaMa 2 model 
+python3 examples/fp8/extract_scales.py --help
+Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE]
+
+KV Scale Extraction Example
+
+Required arguments:
+--quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU).
+Optional arguments:
+--cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None)
+--load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto)
+--revision: Specify the model's revision number. (Default: None)
+--output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None)
+--output_name: Specify the output filename. (Default: kv_cache_scales.json)
+--tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None)
+```
+```python
+Example:
+python3 examples/fp8/extract_scales.py --quantized_model <QUANTIZED_MODEL_DIR> --tp_size <TENSOR_PARALLEL_SIZE> --output_dir <PATH_TO_OUTPUT_DIR>
+```
+### 4. Load KV Cache Scaling Factors into VLLM.
+This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8.
+```python
+# prerequisites:
+# -  LLaMa 2 kv_cache_scales.json file
+
+python3 benchmarks/benchmark_throughput.py --help 
+usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
+                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
+                               [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
+                               [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
+                               [--quantization-param-path KV_CACHE_quantization_param_path]
+
+Benchmark Throughput Example  
+optional arguments:
+  -h, --help  show this help message and exit
+  --backend {vllm,hf,mii}
+  --dataset DATASET  Path to the dataset.
+  --input-len INPUT_LEN  Input prompt length for each request
+  --output-len OUTPUT_LEN  Output length for each request. Overrides the output length from the dataset.
+  --model MODEL
+  --tokenizer TOKENIZER
+  --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
+  --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
+  --n N  Number of generated sequences per prompt.
+  --use-beam-search
+  --num-prompts NUM_PROMPTS  Number of prompts to process.
+  --seed SEED
+  --hf-max-batch-size HF_MAX_BATCH_SIZE   Maximum batch size for HF backend.
+  --trust-remote-code trust remote code from huggingface
+  --max-model-len MAX_MODEL_LEN  Maximum length of a sequence (including prompt and output). If None, will be derived from the model.
+  --dtype {auto,half,float16,bfloat16,float,float32}  data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
+  --enforce-eager  enforce eager execution
+  --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
+  --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
+```
+```python
+Example:
+python3 benchmarks/benchmark_throughput.py --input-len <INPUT_LEN> --output-len <OUTPUT_LEN> -tp <TENSOR_PARALLEL_SIZE> --kv-cache-dtype fp8 --quantization-param-path <path/to/kv_cache_scales.json> --model <path-to-llama2>
+```
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
+import argparse
+import glob
+import json
+import os
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+import numpy as np
+import torch
+from safetensors.torch import safe_open
+
+from vllm.model_executor.layers.quantization.schema import QuantParamSchema
+
+
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
+# The main differences are that we add the NPZ format and simplify
+# its functionality drastically for our purposes (e.g. we assume that
+# the quantized model exists locally and there is no need to download it)
+def _prepare_hf_weights(
+    quantized_model_dir: str,
+    load_format: str = "auto",
+    fall_back_to_pt: bool = True,
+) -> Tuple[str, List[str], bool]:
+    if not os.path.isdir(quantized_model_dir):
+        raise FileNotFoundError(
+            f"The quantized model directory `{quantized_model_dir}` "
+            "does not exist.")
+    use_safetensors = False
+    # Some quantized models use .pt files for storing the weights.
+    if load_format == "auto":
+        allow_patterns = ["*.safetensors", "*.bin"]
+    elif load_format == "safetensors":
+        use_safetensors = True
+        allow_patterns = ["*.safetensors"]
+    elif load_format == "pt":
+        allow_patterns = ["*.pt"]
+    elif load_format == "npz":
+        allow_patterns = ["*.npz"]
+    else:
+        raise ValueError(f"Unknown load_format: {load_format}")
+    if fall_back_to_pt:
+        allow_patterns += ["*.pt"]
+
+    hf_weights_files: List[str] = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(
+            os.path.join(quantized_model_dir, pattern))
+        if len(hf_weights_files) > 0:
+            if pattern == "*.safetensors":
+                use_safetensors = True
+            break
+
+    if not use_safetensors:
+        # Exclude files that are not needed for inference.
+        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+        blacklist = [
+            "training_args.bin",
+            "optimizer.bin",
+            "optimizer.pt",
+            "scheduler.pt",
+            "scaler.pt",
+        ]
+        hf_weights_files = [
+            f for f in hf_weights_files
+            if not any(f.endswith(x) for x in blacklist)
+        ]
+
+    if len(hf_weights_files) == 0:
+        raise RuntimeError(
+            f"Cannot find any model weights with `{quantized_model_dir}`")
+
+    return hf_weights_files, use_safetensors
+
+
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
+def _hf_tensorfile_iterator(filename: str, load_format: str,
+                            use_safetensors: bool):
+    if load_format == "npz":
+        assert not use_safetensors
+        with np.load(filename) as data:
+            for name in data.files:
+                param = torch.from_numpy(data[name])
+                yield name, param
+    elif use_safetensors:
+        with safe_open(filename, framework="pt") as f:
+            for name in f.keys():  # NOQA: SIM118
+                param = f.get_tensor(name)
+                yield name, param
+    else:
+        state = torch.load(filename, map_location="cpu")
+        for name, param in state.items():
+            yield name, param
+        del state
+        torch.cuda.empty_cache()
+
+
+def _kv_scales_extractor(
+        hf_tensor_files: Iterable[str],
+        use_safetensors: bool,
+        rank_keyword: str = "rank",
+        expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
+    """
+    Given a list of files containing tensor data, attempt to extract KV cache
+    scales from these files. Intended as a helper function taking in the output
+    from _prepare_hf_weights.
+    Args:
+    rank_keyword        Matches the number immediately after this keyword in the
+                        tensor filename to determine the TP rank corresponding
+                        to said tensor file
+    expected_tp_size    If specified, the TP size of the tensor files is checked
+                        against this and an error is raised if they don't match.
+    Returns a dictionary mapping TP ranks to their relevant KV cache scales.
+    The per-rank scales are themselves represented as a dictionary of layer
+    indices to the respective per-layer scale.
+    """
+    for char in rank_keyword:
+        assert not char.isdecimal(
+        ), f"Rank keyword {rank_keyword} contains a numeric character!"
+    rank_scales_map = {}
+    for tensor_file in hf_tensor_files:
+        try:
+            rank_idx = tensor_file.find(rank_keyword)
+            if rank_idx != -1:
+                start_idx = rank_idx + len(rank_keyword)
+                stop_idx = start_idx
+                while stop_idx < len(
+                        tensor_file) and tensor_file[stop_idx].isdecimal():
+                    stop_idx += 1
+                if stop_idx == start_idx:
+                    raise RuntimeError("Did not find rank # in filename.")
+                rank = int(tensor_file[start_idx:stop_idx])
+            elif len(hf_tensor_files) == 1:
+                # Since there is only one tensor file, we can assume
+                # that it's intended for TP rank 0
+                rank = 0
+            else:
+                raise RuntimeError(
+                    f"Filename does not contain '{rank_keyword}'.")
+        except RuntimeError:
+            print("Unable to determine TP rank "
+                  f"corresponding to file '{tensor_file}'")
+            raise
+
+        if rank not in rank_scales_map:
+            layer_scales_map = {}
+            rank_scales_map[rank] = layer_scales_map
+        else:
+            raise RuntimeError(
+                f"Tensor file '{tensor_file}' shares TP rank {rank} "
+                "with another tensor file.")
+
+        module_delimiter = ":" if args.load_format == "npz" else "."
+        for name, param in _hf_tensorfile_iterator(tensor_file,
+                                                   args.load_format,
+                                                   use_safetensors):
+            if "kv_cache_scaling_factor" in name:
+                nums = [
+                    int(s) for s in name.split(module_delimiter)
+                    if s.isdecimal()
+                ]
+                assert len(
+                    nums) == 1, f"Could not determine layer idx for {name}"
+                layer_idx = nums[0]
+                assert layer_idx not in layer_scales_map, f"Duplicate scaling"\
+                    f" factor corresponding to layer {layer_idx}"
+                try:
+                    layer_scales_map[layer_idx] = param.item()
+                except RuntimeError:
+                    print(
+                        "This utility supports only per-tensor scalar scales "
+                        f"for now. The tensor\n {name} = {param} \nis an "
+                        "invalid scale factor.")
+                    raise
+
+    if all(
+            len(layer_scales_map) == 0
+            for layer_scales_map in rank_scales_map.values()):
+        # Note: this is true even if the rank_scales_map is empty
+        print("WARNING: No KV cache scale factors found. No output saved.")
+        return None
+    empirical_tp_world_size = max(rank_scales_map.keys()) + 1
+    if expected_tp_size is not None:
+        assert expected_tp_size == empirical_tp_world_size, \
+            f"User expected TP world size = {expected_tp_size} " \
+            "from model but tool is expecting TP world size = " \
+            f"{empirical_tp_world_size} from model instead."
+    for i in range(empirical_tp_world_size):
+        assert i in rank_scales_map, "Expected TP world size = "\
+            f"{empirical_tp_world_size} but did not find KV " \
+            f"cache scaling factors for TP rank {i}"
+    print(f"Found TP world size = {empirical_tp_world_size} "
+          "when extracting KV cache scales!")
+    return rank_scales_map
+
+
+def _metadata_extractor(quantized_model_dir: str,
+                        metadata_extract_fns: \
+                        Dict[str, Callable[[Dict[str, Any]], Any]]) \
+                        -> Dict[str, Any]:
+    """
+    Given a directory containing quantized model files, this function
+    aims to extract metadata from the JSON files within this directory.
+    Each JSON file is expected to represent a dictionary in JSON
+    format (referred to as a "JSON-dictionary"). Metadata extraction is
+    defined by a dictionary called metadata_extract_fns, where each
+    metadata field name is mapped to an extraction function.
+
+    These extraction functions are designed to take a JSON-dictionary
+    as their only argument  and return the corresponding metadata.
+    While extraction functions are permitted to raise  exceptions, they
+    should only raise a KeyError or ValueError if the metadata field
+    cannot  be extracted from the current JSON-dictionary, yet there's
+    a possibility of finding it in another JSON-dictionary.
+
+    The function returns a dictionary that maps metadata fields to
+    their extracted data. The keys of this dictionary correspond exactly
+    to those in metadata_extract_fns. If any fields fail to be extracted,
+    their corresponding values are set to None, and a warning is printed.
+    """
+    if not os.path.isdir(quantized_model_dir):
+        raise FileNotFoundError(
+            f"The quantized model directory `{quantized_model_dir}` "
+            "does not exist.")
+    metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))
+
+    result = {}
+    for file in metadata_files:
+        with open(file) as f:
+            try:
+                metadata = json.load(f)
+            except json.JSONDecodeError:
+                print(f"Could not parse `{file}` as a valid metadata file,"
+                      " skipping it.")
+                continue
+            if not isinstance(metadata, dict):
+                print(f"The file `{file}` does not correspond to a "
+                      "JSON-serialized dictionary, skipping it.")
+                continue
+            for metadata_name, extract_fn in metadata_extract_fns.items():
+                try:
+                    metadata_info = extract_fn(metadata)
+                    if metadata_name not in result:
+                        result[metadata_name] = metadata_info
+                    elif metadata_info != result[metadata_name]:
+                        raise RuntimeError(
+                            "Metadata mismatch! Originally found "
+                            f"{metadata_name} = {result[metadata_name]} but "
+                            f"now found {metadata_name} = {metadata_info} in "
+                            f"`{file}`")
+                except KeyError:
+                    # It is possible that a given file does not contain some
+                    # of our selected metadata as it could be located in some
+                    # other metadata file.
+                    # 'EFINAE': extract_fn failure is not an error.
+                    pass
+                except ValueError:
+                    # See above.
+                    pass
+
+    # Warn if we cannot find any of the requested metadata
+    for metadata_name in metadata_extract_fns:
+        if metadata_name not in result:
+            print("WARNING: Unable to find requested metadata field "
+                  f"`{metadata_name}`, setting it to None.")
+            result[metadata_name] = None
+
+    return result
+
+
+def main(args):
+    metadata_extract_fns = {
+        "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"],
+        "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]),
+        "model_dtype": lambda json_dict: json_dict["dtype"]
+    }
+    recovered_metadata = _metadata_extractor(args.quantized_model,
+                                             metadata_extract_fns)
+    if args.tp_size is not None:
+        metadata_tp_size = recovered_metadata["tp_size"]
+        if metadata_tp_size is not None:
+            assert args.tp_size == metadata_tp_size, \
+              f"User expected TP world size = {args.tp_size} " \
+              f"but found TP world size = {metadata_tp_size} from metadata!"
+    expected_tp_size = args.tp_size or recovered_metadata["tp_size"]
+    rank_keyword = "rank"
+    hf_tensor_files, use_safetensors = _prepare_hf_weights(
+        args.quantized_model, args.load_format)
+    rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors,
+                                           rank_keyword, expected_tp_size)
+    # Postprocess: formatting to the current schema. Consider pulling it
+    # out into a dedicated function should it ever become more complicated.
+    rank_scales_map = {
+        rank: {k: scale[k]
+               for k in sorted(scale.keys())}
+        for rank, scale in rank_scales_map.items()
+    }
+    # TODO: Expand this with activation and weights scaling factors when
+    # they are used in the future
+    schema = QuantParamSchema(
+        model_type=recovered_metadata["model_type"],
+        kv_cache={
+            "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else
+                      recovered_metadata["model_dtype"]),
+            "scaling_factor":
+            rank_scales_map
+        },
+    )
+
+    if args.output_dir is None:
+        output_file = os.path.join(args.quantized_model, args.output_name)
+    else:
+        if not os.path.isdir(args.output_dir):
+            os.makedirs(args.output_dir, exist_ok=True)
+        output_file = os.path.join(args.output_dir, args.output_name)
+
+    with open(output_file, 'w') as f:
+        f.write(schema.model_dump_json(indent=4))
+        print(f"Completed! KV cache scaling factors saved to {output_file}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="This simple utility extracts the "
+        "KV cache scaling factors from a quantized HF model "
+        "and saves them to a JSON file compatible with later "
+        "use by vLLM (pass this file to the appropriate "
+        "runtime typically using the argument "
+        "--quantization-param-path <filename>). This is only used "
+        "if the KV cache dtype is FP8 and on ROCm (AMD GPU).")
+    parser.add_argument(
+        "--quantized_model",
+        help="Specify the directory containing a single quantized HF model. "
+        "It is expected that the quantization format is FP8_E4M3, for use "
+        "on ROCm (AMD GPU).",
+        required=True)
+    parser.add_argument(
+        "--load_format",
+        help="Optionally specify the format of the model's tensor files "
+        "containing the KV cache scaling factors.",
+        choices=["auto", "safetensors", "npz", "pt"],
+        default="auto")
+    parser.add_argument(
+        "--output_dir",
+        help="Optionally specify the output directory. By default the "
+        "KV cache scaling factors will be saved in the model directory, "
+        "however you can override this behavior here.",
+        default=None)
+    parser.add_argument(
+        "--output_name",
+        help="Optionally specify the output filename.",
+        # TODO: Change this once additional scaling factors are enabled
+        default="kv_cache_scales.json")
+    parser.add_argument(
+        "--tp_size",
+        help="Optionally specify the tensor-parallel (TP) size that the "
+        "quantized model should correspond to. If specified, during KV "
+        "cache scaling factor extraction the observed TP size will be "
+        "checked against this and an error will be raised if there is "
+        "a mismatch. If not specified, the quantized model's expected "
+        "TP size is instead inferred from the largest TP rank observed. "
+        "The expected TP size is cross-checked against the TP ranks "
+        "observed in the quantized model and an error is raised if any "
+        "discrepancies are found.",
+        default=None,
+        type=int)
+    args = parser.parse_args()
+
+    main(args)
--- a/examples/fp8/quantizer/README.md
+++ b/examples/fp8/quantizer/README.md
+### Quantizer Utilities
+`quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM:
+`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py`
+
+### Prerequisite
+
+#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later
+`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 
+
+#### AMMO Download (code and docs)
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz`
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz`
+
+### Usage
+
+#### Run on H100 system for speed if FP8; number of GPUs depends on the model size
+
+#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache:
+`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1`
+
+Outputs: the model structure, the quantized model, and its parameters (with scaling factors) are saved in JSON and Safetensors formats (the npz file is generated for reference only).
+```
+# ll ./ll2_7b_fp8/
+total 19998244
+drwxr-xr-x 2 root root        4096 Feb  7 01:08 ./
+drwxrwxr-x 8 1060 1061        4096 Feb  7 01:08 ../
+-rw-r--r-- 1 root root      176411 Feb  7 01:08 llama_tp1.json
+-rw-r--r-- 1 root root 13477087480 Feb  7 01:09 llama_tp1_rank0.npz
+-rw-r--r-- 1 root root  7000893272 Feb  7 01:08 rank0.safetensors
+#
+```
+
--- a/examples/fp8/quantizer/quantize.py
+++ b/examples/fp8/quantizer/quantize.py
--- a/examples/gradio_openai_chatbot_webserver.py
+++ b/examples/gradio_openai_chatbot_webserver.py
+import argparse
+
+import gradio as gr
+from openai import OpenAI
+
+# Argument parser setup
+parser = argparse.ArgumentParser(
+    description='Chatbot Interface with Customizable Parameters')
+parser.add_argument('--model-url',
+                    type=str,
+                    default='http://localhost:8000/v1',
+                    help='Model URL')
+parser.add_argument('-m',
+                    '--model',
+                    type=str,
+                    required=True,
+                    help='Model name for the chatbot')
+parser.add_argument('--temp',
+                    type=float,
+                    default=0.8,
+                    help='Temperature for text generation')
+parser.add_argument('--stop-token-ids',
+                    type=str,
+                    default='',
+                    help='Comma-separated stop token IDs')
+parser.add_argument("--host", type=str, default=None)
+parser.add_argument("--port", type=int, default=8001)
+
+# Parse the arguments
+args = parser.parse_args()
+
+# Set OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = args.model_url
+
+# Create an OpenAI client to interact with the API server
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+
+def predict(message, history):
+    """Stream the assistant reply for ``message`` given ``history``.
+
+    Args:
+        message: The latest user message.
+        history: Iterable of ``(human, assistant)`` pairs from earlier turns.
+
+    Yields:
+        The reply accumulated so far, growing as each streamed chunk arrives.
+    """
+    # Convert chat history to OpenAI format
+    history_openai_format = [{
+        "role": "system",
+        "content": "You are a great ai assistant."
+    }]
+    for human, assistant in history:
+        history_openai_format.append({"role": "user", "content": human})
+        history_openai_format.append({
+            "role": "assistant",
+            "content": assistant
+        })
+    history_openai_format.append({"role": "user", "content": message})
+
+    # Parse the comma-separated stop token IDs. Blank entries (an empty
+    # option value, stray or trailing commas) are dropped, so an empty
+    # string simply yields an empty list.
+    stripped_ids = (raw.strip() for raw in args.stop_token_ids.split(','))
+    stop_token_ids = [int(token_id) for token_id in stripped_ids if token_id]
+
+    # Create a chat completion request and send it to the API server
+    stream = client.chat.completions.create(
+        model=args.model,  # Model name to use
+        messages=history_openai_format,  # Chat history
+        temperature=args.temp,  # Temperature for text generation
+        stream=True,  # Stream response
+        extra_body={
+            'repetition_penalty': 1,
+            'stop_token_ids': stop_token_ids,
+        })
+
+    # Read and return generated text from response stream
+    partial_message = ""
+    for chunk in stream:
+        # A chunk without any choices has no text to add; skip it instead of
+        # failing on the [0] lookup.
+        if not chunk.choices:
+            continue
+        partial_message += (chunk.choices[0].delta.content or "")
+        yield partial_message
+
+
+# Wrap predict in a Gradio chat UI, enable queuing and start serving it on
+# the host/port given on the command line (share=True is passed to launch).
+gr.ChatInterface(predict).queue().launch(
+    server_name=args.host,
+    server_port=args.port,
+    share=True,
+)
--- a/examples/gradio_webserver.py
+++ b/examples/gradio_webserver.py
+import argparse
+import json
+
+import gradio as gr
+import requests
+
+
+def http_bot(prompt):
+    """Stream completions for ``prompt`` from the server at ``args.model_url``.
+
+    Args:
+        prompt: Text sent as the ``"prompt"`` field of the request payload.
+
+    Yields:
+        The first entry of the ``"text"`` list of each streamed JSON message.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    headers = {"User-Agent": "vLLM Client"}
+    pload = {
+        "prompt": prompt,
+        "stream": True,
+        "max_tokens": 128,
+    }
+    # A streamed response holds on to its connection until the body is fully
+    # consumed or the response is closed, so close it deterministically even
+    # when the consumer stops iterating early.
+    with requests.post(args.model_url,
+                       headers=headers,
+                       json=pload,
+                       stream=True) as response:
+        # Surface HTTP failures as a clear error instead of a confusing
+        # JSON/KeyError while parsing an error body.
+        response.raise_for_status()
+
+        # Messages arrive as NUL-delimited, UTF-8 encoded JSON documents.
+        for chunk in response.iter_lines(chunk_size=8192,
+                                         decode_unicode=False,
+                                         delimiter=b"\0"):
+            if chunk:
+                data = json.loads(chunk.decode("utf-8"))
+                output = data["text"][0]
+                yield output
+
+
+def build_demo():
+    """Assemble the Gradio Blocks UI for the text completion demo.
+
+    Returns:
+        The ``gr.Blocks`` app: a heading, an input textbox and an output
+        textbox, with submitting the input wired to ``http_bot``.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("# vLLM text completion demo\n")
+        prompt_box = gr.Textbox(
+            placeholder="Enter text and press ENTER",
+            label="Input",
+        )
+        result_box = gr.Textbox(
+            placeholder="Generated result from the model",
+            label="Output",
+        )
+        # Submitting the input box feeds http_bot's output to the result box.
+        prompt_box.submit(http_bot, [prompt_box], [result_box])
+    return demo
+
+
+if __name__ == "__main__":
+    # Command-line options: where the UI listens and which endpoint
+    # http_bot posts to.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default=None, type=str)
+    parser.add_argument("--port", default=8001, type=int)
+    parser.add_argument(
+        "--model-url",
+        default="http://localhost:8000/generate",
+        type=str,
+    )
+    args = parser.parse_args()
+
+    # Build the UI, enable queuing, then serve it.
+    demo = build_demo()
+    demo.queue().launch(
+        share=True,
+        server_port=args.port,
+        server_name=args.host,
+    )
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
--- a/examples/llm_engine_example.py
+++ b/examples/llm_engine_example.py
--- a/examples/logging_configuration.md
+++ b/examples/logging_configuration.md
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
--- a/examples/multilora_inference.py
+++ b/examples/multilora_inference.py
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
+from vllm import LLM, SamplingParams
+
+if __name__ == "__main__":
+    # Prompts to complete.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Sampling settings applied to every prompt.
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=16,
+    )
+
+    # Instantiate the LLM.
+    llm = LLM(
+        model="facebook/opt-125m",
+        tensor_parallel_size=1,
+        distributed_executor_backend="ray",
+        dtype="float16",
+        trust_remote_code=True,
+        enforce_eager=True,
+    )
+    # Run generation for all prompts. The result is a list of RequestOutput
+    # objects holding the prompt, the generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Show each prompt next to its first completion.
+    for request_output in outputs:
+        prompt = request_output.prompt
+        generated_text = request_output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference_arctic.py
+++ b/examples/offline_inference_arctic.py
--- a/examples/offline_inference_distributed.py
+++ b/examples/offline_inference_distributed.py
--- a/examples/offline_inference_embedding.py
+++ b/examples/offline_inference_embedding.py
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference_openai.md
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py