v0.5.0.post1

a5753ff5 · zhuwenwen · 21c06ecb · 0f0d8bc0 · a5753ff5 · a5753ff5
Commit a5753ff5 authored Jun 19, 2024 by zhuwenwen
20 changed files
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)

 logger = init_logger(__name__)
@@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

-        from torch.cuda import device_count
-        assert world_size <= device_count(), (
+        assert world_size <= cuda_device_count_stateless(), (
            "please set tensor_parallel_size to less than max local gpu count")

        distributed_init_method = get_distributed_init_method(

--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
+from typing import List, Set, Tuple
+
+import torch
+
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+
+logger = init_logger(__name__)
+
+
+class TPUExecutor(ExecutorBase):
+
+    def _init_executor(self) -> None:
+        assert not self.scheduler_config.chunked_prefill_enabled, (
+            "Chunked prefill is not yet supported for TPU backend")
+        assert not self.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if self.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", self.model_config.dtype)
+            self.model_config.dtype = torch.bfloat16
+
+        # Instantiate the worker and load the model to the device.
+        self._init_worker()
+
+    def _init_worker(self):
+        from vllm.worker.tpu_worker import TPUWorker
+
+        assert self.parallel_config.world_size == 1, (
+            "TPUExecutor currently only supports a single TPU chip.")
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        self.driver_worker = TPUWorker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            self.device_config,
+            self.cache_config,
+            self.load_config,
+            self.vision_language_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def initialize_cache(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        """Initialize the KV cache by invoking the underlying worker."""
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def execute_model(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError("LoRA is not implemented for TPU backend.")
+
+    def check_health(self) -> None:
+        # TPUExecutor will always be healthy as long as it's running.
+        return
+
+
+class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        sexecute_model_req: ExecuteModelRequest,
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(sexecute_model_req)
+        return output
--- a/vllm/inputs.py
+++ b/vllm/inputs.py
@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
 from typing_extensions import NotRequired

 if TYPE_CHECKING:
-    from vllm.sequence import MultiModalData
+    from vllm.multimodal import MultiModalData


 class ParsedText(TypedDict):

--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
 import torch.nn as nn

-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_cpu, is_hip, is_tpu


 class CustomOp(nn.Module):
@@ -56,5 +56,7 @@ class CustomOp(nn.Module):
            return self.forward_hip
        elif is_cpu():
            return self.forward_cpu
+        elif is_tpu():
+            return self.forward_tpu
        else:
            return self.forward_cuda
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -7,8 +7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
    QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken,
-    CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsScheme, CompressedTensorsW4A16,
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)

@@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig):
        layer_quant_details: Dict[str, Any] = dict()
        ignore: List[str] = config.get("ignore", None)

+        # The quant_config has multiple config_groups, each containing
+        # an input_activations key with details about how the activations are
+        # quantized, a weights key indicating how the weights are quantized,
+        # and a list of targets under the `targets` key, dictating which
+        # layers are impacted by the quantization details. The quantization
+        # details follow the structure defined by the QuantizationArgs
+        # pydantic model, which is used to verify the structure of the
+        # quant_config and also store the details for later use.
        for key, quant_config in config["config_groups"].items():
            targets = quant_config.get("targets")
            for target in targets:
                layer_quant_details[target] = {}
                layer_quant_details[target][
-                    "weight"] = QuantizationArgs.parse_obj(
+                    "weights"] = QuantizationArgs.parse_obj(
                        quant_config.get("weights"))
+                try:
                    layer_quant_details[target][
-                    "input"] = QuantizationArgs.parse_obj(
+                        "input_activations"] = QuantizationArgs.parse_obj(
                            quant_config.get("input_activations"))
+                except Exception:
+                    layer_quant_details[target]["input_activations"] = None

        return cls(layer_quant_details=layer_quant_details, ignore=ignore)

@@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig):

        return is_8_bits and is_token_tensor and is_symmetric and is_dynamic

+    def _is_w4a16(self, weight_quant: BaseModel,
+                  input_quant: BaseModel) -> bool:
+        input_quant_none = input_quant is None
+        is_4_bits = weight_quant.num_bits == 4
+        is_symmetric = weight_quant.symmetric
+        is_static = not weight_quant.dynamic
+
+        return is_4_bits and input_quant_none and is_symmetric and is_static
+
    def _get_schema(self, weight_quant: BaseModel,
                    input_quant: BaseModel) -> "CompressedTensorsScheme":
+
+        if self._is_w4a16(weight_quant, input_quant):
+            return CompressedTensorsW4A16(num_bits=weight_quant.num_bits,
+                                          strategy=weight_quant.strategy,
+                                          group_size=weight_quant.group_size)
+
        if self._is_static_tensor_w8a8(weight_quant, input_quant):
            return CompressedTensorsW8A8StaticTensor()

@@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig):
            raise ValueError(
                f"Could not find quantization details for {layer}.")

-        return self._get_schema(weight_quant=layer_quant_details["weight"],
-                                input_quant=layer_quant_details["input"])
+        return self._get_schema(
+            weight_quant=layer_quant_details["weights"],
+            input_quant=layer_quant_details["input_activations"])


 class CompressedTensorsLinearMethod(LinearMethodBase):
@@ -140,6 +167,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
            layer=layer,
            input_size_per_partition=input_size_per_partition,
            output_partition_sizes=output_partition_sizes,
+            input_size=input_size,
            output_size=output_size,
            params_dtype=params_dtype,
            weight_loader=weight_loader)

--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
 from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
    CompressedTensorsUnquantized)
+from .compressed_tensors_w4a16 import CompressedTensorsW4A16  # noqa: F401
 from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
    CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501

--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
        weight_scale = layer.weight_scale

        x_q, input_scales = custom_ops.scaled_int8_quant(x)
-        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales,
+        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales,
                                            weight_scale, x.dtype)
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
        # Input quantize
        x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)

-        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
+        return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale,
                                            weight_scale, x.dtype)
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,11 +257,13 @@ class Fp8LinearMethod(LinearMethodBase):
        #   If dynamic, layer.input_scale is None and x_scale computed from x.
        #   If static, layer.input_scale is scalar and x_scale is input_scale.

-        if bias is None and self.cutlass_fp8_supported:
+        # Temporarily disable CUTLASS kernels due to an illegal memory access
+        #if  bias is None and self.cutlass_fp8_supported:
+        if False:
            qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)

            # Fused GEMM_DQ
-            output = ops.cutlass_scaled_mm_dq(
+            output = ops.cutlass_scaled_mm(
                qinput,
                layer.weight,
                out_dtype=x.dtype,

--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -16,6 +16,7 @@ import requests
 import torch

 import vllm.envs as envs
+from vllm.version import __version__ as VLLM_VERSION

 _config_home = envs.VLLM_CONFIG_ROOT
 _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
@@ -163,9 +164,8 @@ class UsageMessage:
        ])

        # vLLM information
-        import vllm  # delayed import to prevent circular import
        self.context = usage_context.value
-        self.vllm_version = vllm.__version__
+        self.vllm_version = VLLM_VERSION
        self.model_architecture = model_architecture

        # Metadata