Unverified Commit fbe66e1d authored by orellavie1212's avatar orellavie1212 Committed by GitHub
Browse files

added support for quantize on LLM module (#1080)

parent 90979c38
...@@ -38,6 +38,9 @@ class LLM: ...@@ -38,6 +38,9 @@ class LLM:
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
seed: The seed to initialize the random number generator for sampling.
quantization: The method used to quantize the model weights. Currently,
we support "awq". If None, we assume the model weights are not
quantized and use `dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id.
"""
...@@ -51,6 +54,7 @@ class LLM: ...@@ -51,6 +54,7 @@ class LLM:
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
dtype: str = "auto", dtype: str = "auto",
seed: int = 0, seed: int = 0,
quantization: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if "disable_log_stats" not in kwargs: if "disable_log_stats" not in kwargs:
...@@ -63,6 +67,7 @@ class LLM: ...@@ -63,6 +67,7 @@ class LLM:
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
dtype=dtype, dtype=dtype,
seed=seed, seed=seed,
quantization=quantization,
**kwargs, **kwargs,
) )
self.llm_engine = LLMEngine.from_engine_args(engine_args) self.llm_engine = LLMEngine.from_engine_args(engine_args)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment