Commit a5753ff5 authored by zhuwenwen
Browse files

v0.5.0.post1

parents 21c06ecb 0f0d8bc0
...@@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ...@@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor) ResultHandler, WorkerMonitor)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, from vllm.utils import (cuda_device_count_stateless,
get_distributed_init_method, get_ip, get_open_port,
get_vllm_instance_id, make_async) get_vllm_instance_id, make_async)
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): ...@@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Disable torch async compiling which won't work with daemonic processes # Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
from torch.cuda import device_count assert world_size <= cuda_device_count_stateless(), (
assert world_size <= device_count(), (
"please set tensor_parallel_size to less than max local gpu count") "please set tensor_parallel_size to less than max local gpu count")
distributed_init_method = get_distributed_init_method( distributed_init_method = get_distributed_init_method(
......
from typing import List, Set, Tuple
import torch
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
logger = init_logger(__name__)
class TPUExecutor(ExecutorBase):
    """Executor that runs the model through one in-process ``TPUWorker``.

    Every public operation is forwarded to ``self.driver_worker``; LoRA
    management is not implemented and always raises.
    """

    def _init_executor(self) -> None:
        """Check the configuration, adjust the dtype and start the worker.

        Raises:
            AssertionError: if chunked prefill or speculative decoding is
                configured.
        """
        chunked_prefill = self.scheduler_config.chunked_prefill_enabled
        assert not chunked_prefill, (
            "Chunked prefill is not yet supported for TPU backend")
        assert not self.speculative_config, (
            "Speculative decoding is not yet supported for TPU backend")

        # float16 / float32 are replaced by bfloat16 (with a warning) before
        # the worker is built, so the worker sees the adjusted dtype.
        replaced_dtypes = (torch.float16, torch.float32)
        if self.model_config.dtype in replaced_dtypes:
            logger.warning(
                "The TPU backend currently does not support %s. "
                "Using bfloat16 instead.", self.model_config.dtype)
            self.model_config.dtype = torch.bfloat16

        # Build the worker and load the model onto the device.
        self._init_worker()

    def _init_worker(self):
        """Create the driver ``TPUWorker``, initialise its device, load the model.

        Raises:
            AssertionError: if ``parallel_config.world_size`` is not 1.
        """
        # Imported here rather than at module level, as in the original.
        from vllm.worker.tpu_worker import TPUWorker

        assert self.parallel_config.world_size == 1, (
            "TPUExecutor currently only supports a single TPU chip.")

        init_method = get_distributed_init_method(get_ip(), get_open_port())
        worker = TPUWorker(
            self.model_config,
            self.parallel_config,
            self.scheduler_config,
            self.device_config,
            self.cache_config,
            self.load_config,
            self.vision_language_config,
            local_rank=0,
            rank=0,
            distributed_init_method=init_method,
        )
        # Publish the worker on the executor before touching the device.
        self.driver_worker = worker
        worker.init_device()
        worker.load_model()

    def initialize_cache(
        self,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
    ) -> None:
        """Log the block counts and hand them to the worker's KV cache setup.

        Args:
            num_gpu_blocks: device-side block count (logged as "TPU blocks").
            num_cpu_blocks: CPU-side block count.
        """
        # NOTE: Logging happens here because other executors may own more
        # than one worker. Engine-level logging would also work, but the
        # device is not yet abstracted for non-GPU configurations.
        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)
        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Return the worker's answer for the number of available KV blocks."""
        worker = self.driver_worker
        return worker.determine_num_available_blocks()

    def execute_model(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        """Run ``execute_model_req`` on the driver worker and return its output."""
        return self.driver_worker.execute_model(execute_model_req)

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Unsupported on this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("LoRA is not implemented for TPU backend.")

    def remove_lora(self, lora_id: int) -> bool:
        """Unsupported on this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("LoRA is not implemented for TPU backend.")

    def list_loras(self) -> Set[int]:
        """Unsupported on this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError("LoRA is not implemented for TPU backend.")

    def check_health(self) -> None:
        """No-op health check.

        The executor is treated as healthy for as long as it is running, so
        there is nothing to verify.
        """
        return None
class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
    """Async flavour of ``TPUExecutor``."""

    async def execute_model_async(
        self,
        sexecute_model_req: ExecuteModelRequest,
    ) -> List[SamplerOutput]:
        """Await the driver worker's ``execute_model`` wrapped by ``make_async``.

        The return annotation matches ``TPUExecutor.execute_model``, which
        forwards to the same worker method and is annotated as returning
        ``List[SamplerOutput]``.

        Args:
            sexecute_model_req: the request forwarded unchanged to the
                worker.

        Returns:
            Whatever the worker's ``execute_model`` returns for the request.
        """
        # NOTE(review): the parameter name differs from the synchronous
        # method's ``execute_model_req`` and looks like a typo. It is kept
        # so existing keyword callers keep working -- confirm against
        # ``ExecutorAsyncBase`` before renaming.
        run_model = make_async(self.driver_worker.execute_model)
        return await run_model(sexecute_model_req)
...@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, ...@@ -4,7 +4,7 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
from typing_extensions import NotRequired from typing_extensions import NotRequired
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.sequence import MultiModalData from vllm.multimodal import MultiModalData
class ParsedText(TypedDict): class ParsedText(TypedDict):
......
import torch.nn as nn import torch.nn as nn
from vllm.utils import is_cpu, is_hip from vllm.utils import is_cpu, is_hip, is_tpu
class CustomOp(nn.Module): class CustomOp(nn.Module):
...@@ -56,5 +56,7 @@ class CustomOp(nn.Module): ...@@ -56,5 +56,7 @@ class CustomOp(nn.Module):
return self.forward_hip return self.forward_hip
elif is_cpu(): elif is_cpu():
return self.forward_cpu return self.forward_cpu
elif is_tpu():
return self.forward_tpu
else: else:
return self.forward_cuda return self.forward_cuda
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
}
}
...@@ -7,8 +7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase ...@@ -7,8 +7,8 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig) QuantizationConfig)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken, CompressedTensorsScheme, CompressedTensorsW4A16,
CompressedTensorsW8A8StaticTensor) CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match) QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
...@@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -47,16 +47,27 @@ class CompressedTensorsConfig(QuantizationConfig):
layer_quant_details: Dict[str, Any] = dict() layer_quant_details: Dict[str, Any] = dict()
ignore: List[str] = config.get("ignore", None) ignore: List[str] = config.get("ignore", None)
# The quant_config has multiple config_groups, each containing
# an input_activations key with details about how the activations are
# quantized, a weights key indicating how the weights are quantized,
# and a list of targets under the `targets` key, dictating which
# layers are impacted by the quantization details. The quantization
# details follow the structure defined by the QuantizationArgs
# pydantic model, which is used to verify the structure of the
# quant_config and also store the details for later use.
for key, quant_config in config["config_groups"].items(): for key, quant_config in config["config_groups"].items():
targets = quant_config.get("targets") targets = quant_config.get("targets")
for target in targets: for target in targets:
layer_quant_details[target] = {} layer_quant_details[target] = {}
layer_quant_details[target][ layer_quant_details[target][
"weight"] = QuantizationArgs.parse_obj( "weights"] = QuantizationArgs.parse_obj(
quant_config.get("weights")) quant_config.get("weights"))
try:
layer_quant_details[target][ layer_quant_details[target][
"input"] = QuantizationArgs.parse_obj( "input_activations"] = QuantizationArgs.parse_obj(
quant_config.get("input_activations")) quant_config.get("input_activations"))
except Exception:
layer_quant_details[target]["input_activations"] = None
return cls(layer_quant_details=layer_quant_details, ignore=ignore) return cls(layer_quant_details=layer_quant_details, ignore=ignore)
...@@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -86,8 +97,23 @@ class CompressedTensorsConfig(QuantizationConfig):
return is_8_bits and is_token_tensor and is_symmetric and is_dynamic return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
def _is_w4a16(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
input_quant_none = input_quant is None
is_4_bits = weight_quant.num_bits == 4
is_symmetric = weight_quant.symmetric
is_static = not weight_quant.dynamic
return is_4_bits and input_quant_none and is_symmetric and is_static
def _get_schema(self, weight_quant: BaseModel, def _get_schema(self, weight_quant: BaseModel,
input_quant: BaseModel) -> "CompressedTensorsScheme": input_quant: BaseModel) -> "CompressedTensorsScheme":
if self._is_w4a16(weight_quant, input_quant):
return CompressedTensorsW4A16(num_bits=weight_quant.num_bits,
strategy=weight_quant.strategy,
group_size=weight_quant.group_size)
if self._is_static_tensor_w8a8(weight_quant, input_quant): if self._is_static_tensor_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8StaticTensor() return CompressedTensorsW8A8StaticTensor()
...@@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -113,8 +139,9 @@ class CompressedTensorsConfig(QuantizationConfig):
raise ValueError( raise ValueError(
f"Could not find quantization details for {layer}.") f"Could not find quantization details for {layer}.")
return self._get_schema(weight_quant=layer_quant_details["weight"], return self._get_schema(
input_quant=layer_quant_details["input"]) weight_quant=layer_quant_details["weights"],
input_quant=layer_quant_details["input_activations"])
class CompressedTensorsLinearMethod(LinearMethodBase): class CompressedTensorsLinearMethod(LinearMethodBase):
...@@ -140,6 +167,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase): ...@@ -140,6 +167,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
layer=layer, layer=layer,
input_size_per_partition=input_size_per_partition, input_size_per_partition=input_size_per_partition,
output_partition_sizes=output_partition_sizes, output_partition_sizes=output_partition_sizes,
input_size=input_size,
output_size=output_size, output_size=output_size,
params_dtype=params_dtype, params_dtype=params_dtype,
weight_loader=weight_loader) weight_loader=weight_loader)
......
from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401 from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401
from .compressed_tensors_unquantized import ( # noqa: F401 from .compressed_tensors_unquantized import ( # noqa: F401
CompressedTensorsUnquantized) CompressedTensorsUnquantized)
from .compressed_tensors_w4a16 import CompressedTensorsW4A16 # noqa: F401
from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501 from .compressed_tensors_w8a8_dynamictoken import ( # noqa: F401, E501
CompressedTensorsW8A8DynamicToken) CompressedTensorsW8A8DynamicToken)
from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501 from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501
......
...@@ -81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme): ...@@ -81,5 +81,5 @@ class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
weight_scale = layer.weight_scale weight_scale = layer.weight_scale
x_q, input_scales = custom_ops.scaled_int8_quant(x) x_q, input_scales = custom_ops.scaled_int8_quant(x)
return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales, return custom_ops.cutlass_scaled_mm(x_q, weight.t(), input_scales,
weight_scale, x.dtype) weight_scale, x.dtype)
...@@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme): ...@@ -99,5 +99,5 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
# Input quantize # Input quantize
x_q, _ = custom_ops.scaled_int8_quant(x, act_scale) x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale, return custom_ops.cutlass_scaled_mm(x_q, weight.t(), act_scale,
weight_scale, x.dtype) weight_scale, x.dtype)
...@@ -257,11 +257,13 @@ class Fp8LinearMethod(LinearMethodBase): ...@@ -257,11 +257,13 @@ class Fp8LinearMethod(LinearMethodBase):
# If dynamic, layer.input_scale is None and x_scale computed from x. # If dynamic, layer.input_scale is None and x_scale computed from x.
# If static, layer.input_scale is scalar and x_scale is input_scale. # If static, layer.input_scale is scalar and x_scale is input_scale.
if bias is None and self.cutlass_fp8_supported: # Temporarily disable CUTLASS kernels due to an illegal memory access
#if bias is None and self.cutlass_fp8_supported:
if False:
qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
# Fused GEMM_DQ # Fused GEMM_DQ
output = ops.cutlass_scaled_mm_dq( output = ops.cutlass_scaled_mm(
qinput, qinput,
layer.weight, layer.weight,
out_dtype=x.dtype, out_dtype=x.dtype,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -16,6 +16,7 @@ import requests ...@@ -16,6 +16,7 @@ import requests
import torch import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.version import __version__ as VLLM_VERSION
_config_home = envs.VLLM_CONFIG_ROOT _config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json") _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
...@@ -163,9 +164,8 @@ class UsageMessage: ...@@ -163,9 +164,8 @@ class UsageMessage:
]) ])
# vLLM information # vLLM information
import vllm # delayed import to prevent circular import
self.context = usage_context.value self.context = usage_context.value
self.vllm_version = vllm.__version__ self.vllm_version = VLLM_VERSION
self.model_architecture = model_architecture self.model_architecture = model_architecture
# Metadata # Metadata
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment