Unverified commit 9f771b3a, authored by Jinzhen Lin and committed by GitHub
Browse files

[Quantization] add humming quantization kernel (#34556)

parent c9d3c6e6
......@@ -953,8 +953,12 @@ class ModelConfig:
"mxfp4",
"gpt_oss_mxfp4",
"cpu_awq",
"humming",
"gguf",
]
# if the user specifies humming, we should always use humming
if self.quantization == "humming":
overrides = ["humming"] + overrides
quantization_methods = [
q for q in supported_quantization if q not in overrides
]
......
......@@ -152,6 +152,10 @@ if TYPE_CHECKING:
VLLM_RAY_EXTRA_ENV_VARS_TO_COPY: str = ""
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_MARLIN_INPUT_DTYPE: Literal["int8", "fp8"] | None = None
VLLM_HUMMING_ONLINE_QUANT_CONFIG: dict[str, Any] | None = None
VLLM_HUMMING_INPUT_QUANT_CONFIG: dict[str, Any] | None = None
VLLM_HUMMING_USE_F16_ACCUM: bool = False
VLLM_HUMMING_MOE_GEMM_TYPE: Literal["indexed", "grouped", "auto"] | None = None
VLLM_MXFP4_USE_MARLIN: bool | None = None
VLLM_DEEPEPLL_NVFP4_DISPATCH: bool = False
VLLM_V1_USE_OUTLINES_CACHE: bool = False
......@@ -285,6 +289,15 @@ def maybe_convert_bool(value: str | None) -> bool | None:
return bool(int(value))
def maybe_convert_json_str_or_file(value: str | None) -> dict[str, Any] | None:
if value is None:
return None
if os.path.exists(value):
with open(value) as f:
return json.load(f)
return json.loads(value)
def disable_compile_cache() -> bool:
    """Report whether ``VLLM_DISABLE_COMPILE_CACHE`` is set to a non-zero integer."""
    raw_flag = os.environ.get("VLLM_DISABLE_COMPILE_CACHE", "0")
    return int(raw_flag) != 0
......@@ -1193,6 +1206,25 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MARLIN_INPUT_DTYPE": env_with_choices(
"VLLM_MARLIN_INPUT_DTYPE", None, ["int8", "fp8"]
),
# The online quantization dtype for humming kernel
"VLLM_HUMMING_ONLINE_QUANT_CONFIG": lambda: maybe_convert_json_str_or_file(
os.environ.get("VLLM_HUMMING_ONLINE_QUANT_CONFIG", None)
),
# The activation dtype config for humming kernel
"VLLM_HUMMING_INPUT_QUANT_CONFIG": lambda: maybe_convert_json_str_or_file(
os.environ.get("VLLM_HUMMING_INPUT_QUANT_CONFIG", None)
),
# Whether to use fp16 accumulator mma
"VLLM_HUMMING_USE_F16_ACCUM": lambda: maybe_convert_bool(
os.environ.get("VLLM_HUMMING_USE_F16_ACCUM", "0")
),
# Whether to use indexed gemm for humming moe
# if 1, force use indexed gemm
# if 0, force use grouped gemm
# if None, choose better gemm type automatically
"VLLM_HUMMING_MOE_GEMM_TYPE": lambda: maybe_convert_bool(
os.environ.get("VLLM_HUMMING_MOE_GEMM_TYPE", None)
),
# Whether to use DeepEPLL kernels for NVFP4 quantization and dispatch method
# only supported on Blackwell GPUs and with
# https://github.com/deepseek-ai/DeepEP/pull/341
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Fused MoE utilities for Humming."""
import json
import math
from typing import TYPE_CHECKING, Any
import torch
from humming import dtypes
from humming.config import GemmType as HummingGemmType
from humming.layer import HummingLayerMeta, HummingMethod
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import envs
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size,
)
from vllm.model_executor.layers.fused_moe.moe_fused_mul_sum import moe_fused_mul_sum
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_permute,
moe_unpermute,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
from vllm.platforms import current_platform
from vllm.v1.worker.workspace import current_workspace_manager
if TYPE_CHECKING:
from vllm.model_executor.layers.quantization.humming import HummingMoEMethod
logger = init_logger(__name__)
def get_humming_moe_gemm_type() -> str:
    """Choose the GEMM flavour used by the Humming MoE kernels.

    ``VLLM_HUMMING_MOE_GEMM_TYPE`` wins when it selects a type explicitly.
    Both spellings of the setting are accepted: a boolean flag
    (``True`` -> indexed, ``False`` -> grouped) and the strings
    ``"indexed"`` / ``"grouped"`` (case-insensitive). Any other value,
    including ``None`` and ``"auto"``, falls back to a choice based on the
    device capability.

    Returns:
        ``"indexed"`` or ``"grouped"``.
    """
    env_value = envs.VLLM_HUMMING_MOE_GEMM_TYPE
    if isinstance(env_value, bool):
        # The env var is parsed through maybe_convert_bool, so a bool can
        # arrive here; calling .lower() on it would raise AttributeError.
        requested = "indexed" if env_value else "grouped"
    else:
        requested = str(env_value or "").lower()

    if requested in ("indexed", "grouped"):
        gemm_type = requested
    elif current_platform.has_device_capability(90):
        # for device that supports TMA, use grouped gemm
        gemm_type = "grouped"
    else:
        gemm_type = "indexed"
    logger.info_once(f"Using {gemm_type} gemm for humming moe")  # noqa
    return gemm_type
class HummingExpertsBase(mk.FusedMoEExpertsModular):
    """Common plumbing for the Humming fused-MoE expert implementations.

    Subclasses select the GEMM flavour through ``humming_gemm_type``, the
    activation layout through ``activation_format()``, and implement
    ``main_apply``. This base class derives the tuning/compute configs,
    describes the intermediate buffers of the
    gate_up -> activation -> down pipeline, and maps those buffers onto two
    alternating workspaces.
    """

    def __init__(
        self,
        layer: torch.nn.Module,
        quant_method: "HummingMoEMethod",
        prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular | None = None,
    ):
        """Bind the experts to ``layer`` and build the Humming configs.

        Args:
            layer: MoE layer; ``num_experts`` and ``global_num_experts`` are
                read from it here.
            quant_method: Supplies ``moe`` and ``moe_quant_config`` for the
                parent constructor.
            prepare_finalize: When given, the parent constructor is invoked;
                for batched formats it also provides the per-rank token
                limit and the dispatcher count. When ``None`` the parent
                constructor is not called, which is only allowed for
                non-batched formats.
        """
        self.layer = layer
        self.num_experts = self.layer.num_experts
        self.global_num_experts = self.layer.global_num_experts
        self.init_humming_moe()
        if prepare_finalize is not None:
            max_num_tokens: int | None = None
            num_dispatchers: int | None = None
            if self.is_batched:
                max_num_tokens = prepare_finalize.max_num_tokens_per_rank()
                num_dispatchers = prepare_finalize.num_dispatchers()
            assert quant_method.moe_quant_config is not None
            super().__init__(
                moe_config=quant_method.moe,
                quant_config=quant_method.moe_quant_config,
                max_num_tokens=max_num_tokens,
                num_dispatchers=num_dispatchers,
            )
        else:
            assert not self.is_batched

    def init_humming_moe(self):
        """Build the compute config and per-sublayer tuning configs.

        The JSON-serialized forms are cached as ``*_str`` attributes; those
        strings are what the subclasses pass to
        ``HummingMethod.forward_layer``.
        """
        self.compute_config = {
            "use_batch_invariant": envs.VLLM_BATCH_INVARIANT,
            "use_f16_accum": envs.VLLM_HUMMING_USE_F16_ACCUM,
            "gemm_type": self.humming_gemm_type.value,
        }
        self.w13_tuning_config = HummingMethod.get_default_tuning_configs(
            layer=self.layer,
            use_f16_accum=envs.VLLM_HUMMING_USE_F16_ACCUM,
            use_batch_invariant=envs.VLLM_BATCH_INVARIANT,
            gemm_type=self.humming_gemm_type,
            sublayer_name="w13",
        )
        self.w2_tuning_config = HummingMethod.get_default_tuning_configs(
            layer=self.layer,
            use_f16_accum=envs.VLLM_HUMMING_USE_F16_ACCUM,
            use_batch_invariant=envs.VLLM_BATCH_INVARIANT,
            gemm_type=self.humming_gemm_type,
            sublayer_name="w2",
        )
        self.compute_config_str = json.dumps(self.compute_config)
        self.w13_tuning_config_str = json.dumps(self.w13_tuning_config)
        self.w2_tuning_config_str = json.dumps(self.w2_tuning_config)

    def get_global_valid_shape_m(self, topk_ids: torch.Tensor):
        """Return the number of (token, expert) pairs across all DP ranks.

        Uses the local token count unless the forward context carries DP
        metadata, in which case the token counts of all ranks are summed.
        """
        num_tokens = topk_ids.size(0)
        ctx = get_forward_context()
        if ctx.dp_metadata is not None:
            num_tokens = ctx.dp_metadata.num_tokens_across_dp_cpu.sum().item()
        return num_tokens * topk_ids.size(1)

    def estimate_local_valid_shape_m(self, topk_ids: torch.Tensor):
        """Estimate the rows handled by the local experts (for kernel tuning).

        Scales the global pair count by the local share of experts and
        rounds up.
        """
        # estimate shape_m for kernel tuning
        global_valid_shape_m = self.get_global_valid_shape_m(topk_ids)
        num_experts = self.num_experts
        global_num_experts = self.global_num_experts
        return math.ceil(global_valid_shape_m * num_experts / global_num_experts)

    @property
    def humming_gemm_type(self) -> HummingGemmType:
        """GEMM flavour of this implementation; subclasses must override."""
        raise NotImplementedError

    @property
    def is_batched(self) -> bool:
        """Whether the activation format is ``BatchedExperts``."""
        return self.activation_format() == mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        """Accept every weight/activation quantization key combination."""
        return True

    def supports_expert_map(self) -> bool:
        """Expert maps are supported."""
        return True

    @staticmethod
    def _supports_current_device() -> bool:
        """Require CUDA with device capability (7, 5) or newer."""
        platform = current_platform
        return platform.is_cuda() and platform.has_device_capability((7, 5))

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        """Activations without the gating multiply are supported."""
        return True

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        """Return whether ``activation`` is in the supported list below."""
        # Humming uses apply_moe_activation() callback for activation,
        # so any activation supported there can be used here.
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SWIGLUSTEP,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Reject configs that use the FI NVL one-sided or two-sided kernels."""
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """Return ``(num_experts, num_tokens, N, K, top_k)`` for this call.

        ``N`` is half of the w13 meta's ``shape_n`` and ``K`` is its
        ``shape_k``. For the batched format ``a1`` must be 3-D with the
        expert count on dim 0 and the token count on dim 1; otherwise the
        token count is ``a1.size(0)`` and must match ``topk_ids``.
        """
        meta1: HummingLayerMeta = self.layer.humming_metas["w13"]
        meta2: HummingLayerMeta = self.layer.humming_metas["w2"]
        assert meta1.num_experts == meta2.num_experts
        num_experts = meta1.num_experts
        top_k = topk_ids.size(1)
        assert w1.size(0) == num_experts
        assert w2.size(0) == num_experts
        if not self.is_batched:
            num_tokens = a1.size(0)
            assert topk_ids.size(0) == num_tokens
        else:
            assert a1.dim() == 3
            assert a1.size(0) == num_experts
            num_tokens = a1.size(1)
        return meta1.num_experts, num_tokens, meta1.shape_n // 2, meta1.shape_k, top_k

    def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
        """Describe the intermediate buffers of the MoE pipeline.

        Args:
            M: Number of input tokens (ignored for the batched format, which
                sizes buffers from ``max_num_tokens`` / ``num_dispatchers``).
            topk: Number of experts selected per token.
            activation: Decides the width of the down-projection input.

        Returns:
            ``(buffer_metas, required_buffers)``: a dict mapping buffer name
            to its ``shape`` and torch ``dtype``, and the ordered list of
            buffer names actually needed for this configuration.
        """
        num_experts = self.num_experts
        N = self.layer.intermediate_size
        K = self.layer.hidden_size
        assert isinstance(num_experts, int)
        assert isinstance(N, int)
        assert isinstance(K, int)
        # hidden_states
        # (-> quanted_gate_up_input) (if not BF16/FP16 activation)
        # -> gate_up_output
        # -> activation_output
        # (-> quanted_down_input) (if not BF16/FP16 activation)
        # -> down_output
        # (-> output) (if not is_batched)
        # Neighboring nodes are required to utilize distinct workspaces.
        # The output must be derived from workspace1.
        output_shape: tuple[int, ...]
        if self.is_batched:
            max_num_tokens = self.max_num_tokens
            num_dispatchers = self.num_dispatchers
            assert max_num_tokens is not None and num_dispatchers is not None
            input_shape_m = num_experts * max_num_tokens
            real_shape_m = num_experts * max_num_tokens * num_dispatchers
            output_shape = (num_experts, max_num_tokens * num_dispatchers, K)
        else:
            input_shape_m = M
            if self.humming_gemm_type != HummingGemmType.INDEXED:
                # Non-indexed GEMMs consume one input row per (token, expert).
                input_shape_m = M * topk
            real_shape_m = M * topk
            output_shape = (M, K)
        down_input_size = N if activation.is_gated else (N * 2)
        a_dtype = self.layer.humming_metas["w13"].a_dtype
        c_dtype = self.layer.humming_metas["w13"].c_dtype
        num_bits = a_dtype.num_bits
        torch_dtype_map = {
            dtypes.float16: torch.float16,
            dtypes.bfloat16: torch.bfloat16,
            dtypes.float8e4m3: torch.float8_e4m3fn,
            dtypes.int8: torch.int8,
            dtypes.int4: torch.uint8,
        }
        buffer_metas = {
            "quanted_gate_up_input": {
                "shape": (input_shape_m, K),
                "dtype": torch_dtype_map[a_dtype],
            },
            "gate_up_output": {
                "shape": (real_shape_m, N * 2),
                "dtype": torch_dtype_map[c_dtype],
            },
            "activation_output": {
                "shape": (real_shape_m, down_input_size),
                "dtype": torch_dtype_map[c_dtype],
            },
            "quanted_down_input": {
                "shape": (real_shape_m, down_input_size),
                "dtype": torch_dtype_map[a_dtype],
            },
            "down_output": {
                "shape": output_shape if self.is_batched else (real_shape_m, K),
                "dtype": torch_dtype_map[c_dtype],
            },
            "output": {
                "shape": output_shape,
                "dtype": torch_dtype_map[c_dtype],
            },
        }
        # 4-bit activations are mapped to uint8 above, so the last dim of the
        # quantized buffers is halved.
        for key, meta in buffer_metas.items():
            if "quanted" in key and num_bits == 4:
                meta["shape"] = meta["shape"][:-1] + (meta["shape"][-1] // 2,)
        if num_bits == 16:
            required_buffers = ["gate_up_output", "activation_output", "down_output"]
        else:
            required_buffers = [
                "quanted_gate_up_input",
                "gate_up_output",
                "activation_output",
                "quanted_down_input",
                "down_output",
            ]
        # batched moe use down_output as output
        if not self.is_batched:
            required_buffers.append("output")
        return buffer_metas, required_buffers

    def _workspace_shapes(self, M: int, topk: int, activation: MoEActivation):
        """Size the two ping-pong workspaces and report the output shape.

        Walking the pipeline backwards, buffers are assigned alternately to
        workspace1 and workspace2, so the final buffer always lands in
        workspace1 and neighbouring stages never share a workspace. Each
        workspace is sized for its largest buffer.

        Returns:
            ``(workspace1_shape, workspace2_shape, output_shape)`` where the
            workspace shapes are 1-D and expressed in 2-byte units
            (presumably the workspaces are allocated with a 16-bit dtype;
            confirm against the allocator).
        """
        buffer_metas, required_buffers = self.get_buffer_metas(M, topk, activation)
        workspace1_nbytes = 0
        workspace2_nbytes = 0
        for index, name in enumerate(required_buffers[::-1]):
            buffer_meta = buffer_metas[name]
            nelement = math.prod(buffer_meta["shape"])
            nbytes = nelement * buffer_meta["dtype"].itemsize
            if index % 2 == 0:
                workspace1_nbytes = max(workspace1_nbytes, nbytes)
            else:
                workspace2_nbytes = max(workspace2_nbytes, nbytes)
        output_key = "down_output" if self.is_batched else "output"
        output_shape = buffer_metas[output_key]["shape"]
        # Round up when converting bytes to 2-byte units so that an odd byte
        # count (possible for 1-byte buffers) is never under-allocated.
        workspace1_size = (workspace1_nbytes + 1) // 2
        workspace2_size = (workspace2_nbytes + 1) // 2
        return (workspace1_size,), (workspace2_size,), output_shape

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Workspace/output shapes; only ``M``, ``topk`` and ``activation`` are used."""
        return self._workspace_shapes(M, topk, activation)

    def make_workspaces(self, M: int, topk: int, activation: MoEActivation):
        """Allocate both workspaces and carve the output view out of workspace1.

        Returns:
            ``(workspace1, workspace2, output)``; the workspaces use the
            layer's ``param_dtype``.
        """
        shapes = self._workspace_shapes(M, topk, activation)
        workspace1_shape, workspace2_shape, output_shape = shapes
        torch_dtype = self.layer.param_dtype
        workspace1, workspace2 = current_workspace_manager().get_simultaneous(
            (workspace1_shape, torch_dtype),
            (workspace2_shape, torch_dtype),
        )
        output = _resize_cache(workspace1, output_shape)
        return workspace1, workspace2, output

    def prepare_buffers(
        self,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        M: int,
        topk: int,
        activation: MoEActivation,
    ) -> dict[str, torch.Tensor]:
        """Create a typed view into the workspaces for every required buffer.

        Uses the same alternating assignment as ``_workspace_shapes``, so
        buffers that share a workspace alias the same memory.
        """
        buffer_metas, required_buffers = self.get_buffer_metas(M, topk, activation)
        buffers = {}
        for index, name in enumerate(required_buffers[::-1]):
            buffer_meta = buffer_metas[name]
            workspace = workspace1 if index % 2 == 0 else workspace2
            workspace = workspace.view(buffer_meta["dtype"])
            buffers[name] = _resize_cache(workspace, buffer_meta["shape"])
        return buffers

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Entry point of the modular kernel; forwards to ``main_apply``.

        Only ``hidden_states``, ``topk_weights``, ``topk_ids``, the two
        workspaces and ``expert_tokens_meta`` are forwarded; weights, the
        activation and the expert map are taken from ``self.layer`` by the
        subclasses. ``output`` is not written explicitly here.
        NOTE(review): this presumably relies on ``output`` aliasing the
        start of ``workspace13`` -- confirm against the caller.
        """
        assert not apply_router_weight_on_input
        self.main_apply(
            hidden_states=hidden_states,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            workspace1=workspace13,
            workspace2=workspace2,
            expert_tokens_meta=expert_tokens_meta,
        )

    def main_apply(
        self,
        hidden_states: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ):
        """Run the expert computation; subclasses must override."""
        raise NotImplementedError
class HummingIndexedExperts(HummingExpertsBase):
    """Humming experts using the indexed GEMM on the standard activation format.

    Tokens are not physically permuted: the kernels receive block-aligned
    sorted token ids and expert ids from ``moe_align_block_size``.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # main_apply already performs the weighted top-k reduction.
        return TopKWeightAndReduceNoOP()

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @property
    def humming_gemm_type(self) -> HummingGemmType:
        return HummingGemmType.INDEXED

    def prepare_humming_moe_kwargs(
        self,
        topk_ids: torch.Tensor,
        expert_map: torch.Tensor | None,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Build the keyword arguments for the w13 and w2 indexed GEMM calls.

        The MoE block size is taken from the first w13 tuning entry whose
        ``(min_shape_m, max_shape_m]`` range contains the estimated local
        row count; that block size drives ``moe_align_block_size``.
        ``expert_tokens_meta`` is accepted but not used here.

        Returns:
            ``(moe_kwargs1, moe_kwargs2)`` for the w13 and w2 layers. They
            share the alignment tensors and compute config; w13 uses the
            real ``top_k`` while w2 uses ``top_k=1``.

        Raises:
            ValueError: if no tuning entry covers the estimated row count.
        """
        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)
        # NOTE(review): the lower bound is exclusive, so a row count equal to
        # an entry's min_shape_m is not matched by that entry -- confirm the
        # tuning ranges cover every reachable value.
        for min_shape_m, max_shape_m, config in self.w13_tuning_config:
            if valid_shape_m > min_shape_m and valid_shape_m <= max_shape_m:
                moe_block_size = config["block_shape"][0]
                break
        else:
            raise ValueError(f"cannot found moe_block_size for shape {valid_shape_m}")
        sorted_ids, expert_ids, num_tokens_padded = moe_align_block_size(
            topk_ids=topk_ids,
            block_size=moe_block_size,
            num_experts=self.global_num_experts,
            expert_map=expert_map,
            ignore_invalid_experts=True,
        )
        moe_common_kwargs = {
            "sorted_ids": sorted_ids,
            "expert_ids": expert_ids,
            "num_tokens_padded": num_tokens_padded,
            "compute_config": self.compute_config_str,
            "valid_shape_m": valid_shape_m,
        }
        top_k = topk_ids.size(1)
        moe_kwargs1 = {"top_k": top_k, "tuning_config": self.w13_tuning_config_str}
        moe_kwargs2 = {"top_k": 1, "tuning_config": self.w2_tuning_config_str}
        moe_kwargs1.update(moe_common_kwargs)
        moe_kwargs2.update(moe_common_kwargs)
        return moe_kwargs1, moe_kwargs2

    def main_apply(
        self,
        hidden_states: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ):
        """Run w13 -> activation -> w2 -> weighted top-k sum.

        All intermediates are views into the two workspaces, so the order of
        the steps below matters: buffers sharing a workspace alias the same
        memory. The final result is written to ``buffers["output"]``.
        """
        # Flatten to 2-D (rows, hidden).
        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
        buffers = self.prepare_buffers(
            workspace1,
            workspace2,
            topk_ids.size(0),
            topk_ids.size(1),
            self.layer.activation,
        )
        moe_kwargs1, moe_kwargs2 = self.prepare_humming_moe_kwargs(
            topk_ids=topk_ids,
            expert_map=self.layer.expert_map,
            expert_tokens_meta=expert_tokens_meta,
        )
        # The quantized-input buffer is absent for 16-bit activations, hence
        # the .get(..., None).
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input", None),
            sublayer_name="w13",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["gate_up_output"],
            sublayer_name="w13",
            **moe_kwargs1,
        )
        self.activation(
            activation=self.layer.activation,
            input=buffers["gate_up_output"],
            output=buffers["activation_output"],
        )
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=buffers["activation_output"],
            quanted_input=buffers.get("quanted_down_input", None),
            sublayer_name="w2",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["down_output"].view(-1, hidden_states.size(-1)),
            sublayer_name="w2",
            **moe_kwargs2,
        )
        # Reshape to (num_tokens, top_k, hidden) and reduce over top_k with
        # the router weights.
        moe_fused_mul_sum(
            inputs=buffers["down_output"].view(*topk_ids.shape, -1),
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            expert_map=self.layer.expert_map,
            outputs=buffers["output"],
        )
class HummingGroupedExperts(HummingExpertsBase):
    """Humming experts using the grouped-contiguous GEMM on the standard format.

    Tokens are permuted with ``moe_permute`` before the GEMMs and restored
    (with router weighting) by ``moe_unpermute`` afterwards.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # main_apply already applies the router weights via moe_unpermute.
        return TopKWeightAndReduceNoOP()

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @property
    def humming_gemm_type(self) -> HummingGemmType:
        return HummingGemmType.GROUPED_CONTIGUOUS

    def main_apply(
        self,
        hidden_states: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ):
        """Run permute -> w13 -> activation -> w2 -> unpermute.

        All intermediates are views into the two workspaces, so the order of
        the steps below matters. ``expert_tokens_meta`` is not used here. The
        final result is written to ``buffers["output"]``.
        """
        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)
        buffers = self.prepare_buffers(
            workspace1,
            workspace2,
            topk_ids.size(0),
            topk_ids.size(1),
            self.layer.activation,
        )
        # Permute the tokens; the returned expert_first_token_offset is passed
        # to both GEMMs as their expert_layout.
        hidden_states, _, expert_first_token_offset, inv_perm, _ = moe_permute(
            hidden_states=hidden_states,
            a1q_scale=None,
            topk_ids=topk_ids,
            n_expert=self.global_num_experts,
            n_local_expert=self.num_experts,
            expert_map=self.layer.expert_map,
        )
        # The quantized-input buffer is absent for 16-bit activations, hence
        # the .get(..., None).
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input", None),
            sublayer_name="w13",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["gate_up_output"],
            valid_shape_m=valid_shape_m,
            expert_layout=expert_first_token_offset,
            compute_config=self.compute_config_str,
            tuning_config=self.w13_tuning_config_str,
            sublayer_name="w13",
        )
        self.activation(
            activation=self.layer.activation,
            input=buffers["gate_up_output"],
            output=buffers["activation_output"],
        )
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=buffers["activation_output"],
            quanted_input=buffers.get("quanted_down_input", None),
            sublayer_name="w2",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["down_output"],
            valid_shape_m=valid_shape_m,
            expert_layout=expert_first_token_offset,
            compute_config=self.compute_config_str,
            tuning_config=self.w2_tuning_config_str,
            sublayer_name="w2",
        )
        # Undo the permutation and combine with the router weights.
        moe_unpermute(
            out=buffers["output"],
            permuted_hidden_states=buffers["down_output"].view(*topk_ids.shape, -1),
            topk_weights=topk_weights,
            inv_permuted_idx=inv_perm,
            expert_first_token_offset=expert_first_token_offset,
        )
class BatchedHummingGroupedExperts(HummingExpertsBase):
    """Humming experts using the grouped-masked GEMM on the batched format.

    The per-expert token counts from ``expert_tokens_meta`` are passed to
    the GEMMs as their expert layout. No top-k reduction happens here; it is
    delegated through ``TopKWeightAndReduceDelegate``.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceDelegate()

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.BatchedExperts

    @property
    def humming_gemm_type(self) -> HummingGemmType:
        return HummingGemmType.GROUPED_MASKED

    def main_apply(
        self,
        hidden_states: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ):
        """Run w13 -> activation -> w2 on batched-expert activations.

        All intermediates are views into the two workspaces, so the order of
        the steps below matters. The result is left in
        ``buffers["down_output"]``; ``topk_weights`` is not used here.
        """
        assert expert_tokens_meta is not None
        # Flatten to 2-D (rows, hidden).
        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)
        expert_num_tokens = expert_tokens_meta.expert_num_tokens
        buffers = self.prepare_buffers(
            workspace1,
            workspace2,
            topk_ids.size(0),
            topk_ids.size(1),
            self.layer.activation,
        )
        # The quantized-input buffer is absent for 16-bit activations, hence
        # the .get(..., None).
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input", None),
            sublayer_name="w13",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["gate_up_output"],
            valid_shape_m=valid_shape_m,
            expert_layout=expert_num_tokens,
            compute_config=self.compute_config_str,
            tuning_config=self.w13_tuning_config_str,
            sublayer_name="w13",
        )
        self.activation(
            activation=self.layer.activation,
            input=buffers["gate_up_output"],
            output=buffers["activation_output"],
        )
        inputs, input_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=buffers["activation_output"],
            quanted_input=buffers.get("quanted_down_input", None),
            sublayer_name="w2",
        )
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=inputs,
            input_scale=input_scale,
            outputs=buffers["down_output"].view(-1, hidden_states.size(-1)),
            valid_shape_m=valid_shape_m,
            expert_layout=expert_num_tokens,
            compute_config=self.compute_config_str,
            tuning_config=self.w2_tuning_config_str,
            sublayer_name="w2",
        )
......@@ -1097,7 +1097,11 @@ class FusedMoE(PluggableLayer):
expert_id: int,
return_success: bool = False,
) -> bool | None:
if self.quant_config and self.quant_config.get_name() == "gpt_oss_mxfp4":
quant_config_name = self.quant_config and self.quant_config.get_name()
if quant_config_name == "humming":
assert hasattr(self.quant_method, "weight_schema")
quant_config_name = self.quant_method.weight_schema.quant_method
if quant_config_name == "gpt_oss_mxfp4":
# (FIXME) for gpt-oss all experts are combined
if "bias" in weight_name:
dim1 = loaded_weight.shape[1]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch._subclasses.fake_tensor import FakeTensor
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
@triton.jit
def moe_fused_mul_sum_kernel(
    inputs_ptr,
    topk_weights_ptr,
    outputs_ptr,
    top_ids_ptr,
    expert_map_ptr,
    num_tokens,
    stride_m,
    has_expert_map: tl.constexpr,
    top_k: tl.constexpr,
    size: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    # Weighted sum over the top_k axis:
    #   outputs[m, k] = sum_n inputs[m, n, k] * topk_weights[m, n]
    # Each program handles a (BLOCK_M tokens) x (BLOCK_K features) tile and
    # accumulates in float32. `stride_m` is the element stride between
    # consecutive tokens of `inputs`; consecutive top-k slots are `size`
    # elements apart. When `has_expert_map` is set, (token, slot) pairs whose
    # expert maps to a negative value contribute zero.
    pid_k = tl.program_id(0)
    pid_m = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_k = pid_k * BLOCK_K + tl.arange(0, BLOCK_K)
    m_mask = offs_m < num_tokens
    k_mask = offs_k < size
    mask = m_mask[:, None] & k_mask[None, :]
    # Row offsets are promoted to int64: num_tokens * top_k * size can exceed
    # the int32 range for large batches, which would wrap the addresses.
    offs_m_wide = offs_m.to(tl.int64)
    a_base = inputs_ptr + (offs_m_wide * stride_m)[:, None] + offs_k[None, :]
    b_base = topk_weights_ptr + offs_m * top_k
    acc = tl.zeros((BLOCK_M, BLOCK_K), dtype=tl.float32)
    for n in tl.static_range(top_k):
        b_val = tl.load(b_base + n, mask=m_mask, other=0.0).to(tl.float32)
        if has_expert_map:
            id_val = tl.load(top_ids_ptr + offs_m * top_k + n, mask=m_mask, other=0)
            expert_mask = tl.load(expert_map_ptr + id_val) >= 0
            a_vec = tl.load(
                a_base + n * size,
                mask=mask & expert_mask[:, None],
                other=0.0,
            ).to(tl.float32)
        else:
            a_vec = tl.load(
                a_base + n * size,
                mask=mask,
                other=0.0,
            ).to(tl.float32)
        acc += a_vec * b_val[:, None]
    out_ptrs = outputs_ptr + (offs_m_wide * size)[:, None] + offs_k[None, :]
    tl.store(
        out_ptrs,
        acc.to(outputs_ptr.dtype.element_ty),
        mask=mask,
    )
def _heuristic_config(
    num_tokens: int,
    top_k: int,
    size: int,
    element_size: int,
):
    """Pick launch parameters for ``moe_fused_mul_sum_kernel``.

    The choice depends on the token count, the feature size, whether the
    element type is wider than 2 bytes, and the device capability
    (below 80, 80 up to 90, 90 and above). ``top_k`` is accepted but does
    not influence the result.

    Returns:
        ``(BLOCK_M, BLOCK_K, num_warps, num_stages)``.
    """
    wide_elements = element_size > 2
    hopper_or_newer = current_platform.has_device_capability(90)
    pre_ampere = not current_platform.has_device_capability(80)

    # (inclusive upper bound on num_tokens, BLOCK_M) tiers, plus the value
    # used once every bound is exceeded.
    if hopper_or_newer:
        # SM90/SM100+: prefer small tiles + many CTAs.
        if wide_elements:
            tiers, overflow_block_m = ((4, 1),), 2
        else:
            tiers, overflow_block_m = ((4, 1), (128, 2)), 4
    elif wide_elements:
        tiers, overflow_block_m = ((4, 1), (32, 2)), 4
    else:
        tiers, overflow_block_m = ((4, 1), (32, 2), (128, 4), (1024, 16)), 8
    BLOCK_M = next(
        (block_m for limit, block_m in tiers if num_tokens <= limit),
        overflow_block_m,
    )

    if wide_elements:
        block_k_cap = 256
    elif pre_ampere or hopper_or_newer:
        block_k_cap = 512
    else:
        block_k_cap = 1024
    BLOCK_K = max(256, min(triton.next_power_of_2(size), block_k_cap))

    tile_elems = BLOCK_M * BLOCK_K
    if wide_elements:
        num_warps = max(8, min(16, tile_elems // 64))
    else:
        num_warps = max(4, min(16, tile_elems // 256))
    # Only the middle capability range keeps more than 8 warps.
    if pre_ampere or hopper_or_newer:
        num_warps = min(num_warps, 8)
    num_stages = 2 if (pre_ampere or tile_elems > 2048) else 4
    return BLOCK_M, BLOCK_K, num_warps, num_stages
def moe_fused_mul_sum(
inputs: torch.Tensor,
topk_weights: torch.Tensor,
outputs: torch.Tensor | None = None,
topk_ids: torch.Tensor | None = None,
expert_map: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Fused kernel for MoE (Mixture of Experts) to perform weighted summation
of expert outputs.
Args:
inputs: The output from experts.
Shape: (num_tokens, top_k, hidden_size).
topk_weights: The weights assigned to each expert for each token.
Shape: (num_tokens, top_k).
outputs: Optional pre-allocated output tensor.
Shape: (num_tokens, hidden_size).
topk_ids: Optional indices of the top-k experts. Used when
`expert_map` is provided. Shape: (num_tokens, top_k).
expert_map: Optional mapping for Expert Parallelism. A value < 0
indicates an invalid token/expert pair that will be skipped.
Returns:
The fused weighted sum of expert outputs.
Shape: (num_tokens, hidden_size).
"""
assert inputs.ndim == 3
assert topk_weights.ndim == 2
assert inputs.is_contiguous()
assert topk_weights.is_contiguous()
assert inputs.dtype in (torch.float32, torch.float16, torch.bfloat16)
assert topk_weights.dtype in (torch.float32, torch.float16, torch.bfloat16)
num_tokens, top_k, size = inputs.shape
output_shape = (num_tokens, size)
if outputs is None:
outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)
assert outputs.shape == output_shape
assert topk_weights.shape == (num_tokens, top_k)
if not isinstance(inputs, FakeTensor):
BLOCK_M, BLOCK_K, num_warps, num_stages = _heuristic_config(
num_tokens,
top_k,
size,
inputs.element_size(),
)
grid = (triton.cdiv(size, BLOCK_K), triton.cdiv(num_tokens, BLOCK_M))
moe_fused_mul_sum_kernel[grid](
inputs,
topk_weights,
outputs,
topk_ids,
expert_map,
num_tokens,
top_k * size,
expert_map is not None,
top_k,
size,
BLOCK_M,
BLOCK_K,
num_warps=num_warps,
num_stages=num_stages,
)
return outputs
......@@ -60,6 +60,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
"ModelOptFp8PbWoLinearMethod",
"QuarkLinearMethod",
"ModelOptNvFp4LinearMethod",
"HummingLinearMethod",
]
......@@ -245,6 +246,7 @@ class LinearBase(PluggableLayer):
self,
input_size: int,
output_size: int,
bias: bool = False,
skip_bias_add: bool = False,
params_dtype: torch.dtype | None = None,
quant_config: QuantizationConfig | None = None,
......@@ -258,6 +260,7 @@ class LinearBase(PluggableLayer):
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.has_bias = bias
self.skip_bias_add = skip_bias_add
if params_dtype is None:
params_dtype = torch.get_default_dtype()
......@@ -323,6 +326,7 @@ class ReplicatedLinear(LinearBase):
super().__init__(
input_size,
output_size,
bias,
skip_bias_add,
params_dtype,
quant_config,
......@@ -458,6 +462,7 @@ class ColumnParallelLinear(LinearBase):
super().__init__(
input_size,
output_size,
bias,
skip_bias_add,
params_dtype,
quant_config,
......@@ -483,6 +488,7 @@ class ColumnParallelLinear(LinearBase):
else self.weight_loader
),
)
if bias:
self.bias = Parameter(
torch.empty(self.output_size_per_partition, dtype=params_dtype)
......@@ -817,8 +823,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
# for the packing.
packed_dim = getattr(param, "packed_dim", None)
if packed_dim == output_dim:
shard_size = shard_size // param.packed_factor
shard_offset = shard_offset // param.packed_factor
shard_size = round(shard_size // param.packed_factor)
shard_offset = round(shard_offset // param.packed_factor)
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
param, shard_size, shard_offset
......@@ -1252,8 +1258,8 @@ class QKVParallelLinear(ColumnParallelLinear):
)
if packed_dim == output_dim:
shard_size = shard_size // param.packed_factor
shard_offset = shard_offset // param.packed_factor
shard_size = round(shard_size // param.packed_factor)
shard_offset = round(shard_offset // param.packed_factor)
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
......@@ -1315,8 +1321,8 @@ class QKVParallelLinear(ColumnParallelLinear):
# for the packing.
packed_dim = getattr(param, "packed_dim", None)
if packed_dim == output_dim:
shard_size = shard_size // param.packed_factor
shard_offset = shard_offset // param.packed_factor
shard_size = round(shard_size // param.packed_factor)
shard_offset = round(shard_offset // param.packed_factor)
# Special case for Marlin.
shard_size, shard_offset = adjust_marlin_shard(
......@@ -1440,6 +1446,7 @@ class RowParallelLinear(LinearBase):
super().__init__(
input_size,
output_size,
bias,
skip_bias_add,
params_dtype,
quant_config,
......
......@@ -22,6 +22,7 @@ QuantizationMethods = Literal[
"gptq_marlin",
"awq_marlin",
"gptq",
"humming",
"compressed-tensors",
"bitsandbytes",
"experts_int8",
......@@ -126,6 +127,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from .gguf import GGUFConfig
from .gptq import GPTQConfig
from .gptq_marlin import GPTQMarlinConfig
from .humming import HummingConfig
from .inc import INCConfig
from .modelopt import (
ModelOptFp8Config,
......@@ -162,6 +164,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"mxfp4": Mxfp4Config,
"gpt_oss_mxfp4": GptOssMxfp4Config,
"cpu_awq": CPUAWQConfig,
"humming": HummingConfig,
"online": OnlineQuantizationConfig,
}
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import math
from collections.abc import Callable
from typing import TYPE_CHECKING, Any
import regex as re
import torch
from vllm import envs
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
FusedMoEQuantDesc,
)
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE,
FusedMoEMethodBase,
)
from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.linear import (
LinearBase,
LinearMethodBase,
UnquantizedLinearMethod,
)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig,
QuantizeMethodBase,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.parameter import (
BasevLLMParameter,
BlockQuantScaleParameter,
ChannelQuantScaleParameter,
GroupQuantScaleParameter,
ModelWeightParameter,
PackedvLLMParameter,
PerTensorScaleParameter,
RowvLLMParameter,
)
from vllm.model_executor.utils import set_weight_attrs
if TYPE_CHECKING:
from vllm.model_executor.models.utils import WeightsMapper
try:
from humming.dtypes import DataType
from humming.layer import HummingMethod
from humming.schema import (
BaseInputSchema,
BaseWeightSchema,
HummingInputSchema,
HummingWeightSchema,
)
from humming.utils.weight import quantize_weight
from vllm.model_executor.layers.fused_moe.fused_humming_moe import (
BatchedHummingGroupedExperts,
HummingGroupedExperts,
HummingIndexedExperts,
get_humming_moe_gemm_type,
)
except ModuleNotFoundError:
HummingMethod = None
def assert_humming_available():
    """Assert that the optional ``humming`` package was imported successfully.

    ``HummingMethod`` is set to ``None`` by the module-level import fallback
    when ``humming`` is missing; in that case the assertion fails with an
    installation hint.
    """
    install_hint = (
        "humming is not available, please run "
        "'pip install git+https://github.com/inclusionAI/humming' to install it."
    )
    assert HummingMethod is not None, install_hint
def prepare_padded_shape(shape, x):
    """Round ``shape`` up to the nearest multiple of ``x``.

    Args:
        shape: The original (integer) dimension size.
        x: The (integer) multiple to pad to.

    Returns:
        A tuple ``(padded_shape, pad_amount)`` where ``padded_shape`` is the
        smallest multiple of ``x`` that is ``>= shape`` and ``pad_amount`` is
        ``padded_shape - shape``.
    """
    # Pure-integer ceil division. ``math.ceil(shape / x)`` goes through float
    # division and silently loses precision for very large integers.
    padded_shape = -(-shape // x) * x
    return padded_shape, padded_shape - shape
def prepare_param(tensor, name, extra_attrs):
    """Wrap ``tensor`` in a vLLM parameter class chosen from ``extra_attrs``.

    The parameter class is selected, in priority order, by: the presence of
    ``packed_dim``; a known ``scale_type``; the presence of both / only one of
    ``input_dim`` and ``output_dim``; otherwise the base parameter class.
    Constructor keywords are split off from ``extra_attrs`` and the remaining
    attributes are attached with ``set_weight_attrs``. The caller's dict is
    not modified.

    Args:
        tensor: Backing tensor for the parameter.
        name: Stored on the result as ``param_name``.
        extra_attrs: Attribute dict describing the parameter.

    Returns:
        The constructed parameter.
    """
    attrs = extra_attrs.copy()
    scale_type = attrs.pop("scale_type", None)

    scale_param_classes = {
        "block": BlockQuantScaleParameter,
        "tensor": PerTensorScaleParameter,
        "group": GroupQuantScaleParameter,
        "channel": ChannelQuantScaleParameter,
        "input_scale": PerTensorScaleParameter,
    }

    has_input_dim = "input_dim" in attrs
    has_output_dim = "output_dim" in attrs

    param_cls: type[BasevLLMParameter]
    if "packed_dim" in attrs:
        param_cls = PackedvLLMParameter
    elif scale_type in scale_param_classes:
        param_cls = scale_param_classes[scale_type]
    elif has_output_dim and has_input_dim:
        param_cls = ModelWeightParameter
    elif has_input_dim:
        param_cls = RowvLLMParameter
    elif has_output_dim:
        param_cls = ChannelQuantScaleParameter
    else:
        param_cls = BasevLLMParameter

    # These keys are constructor arguments; everything else becomes a plain
    # attribute on the parameter.
    constructor_keys = (
        "input_dim",
        "output_dim",
        "packed_dim",
        "packed_factor",
        "weight_loader",
    )
    cls_kwargs = {key: attrs.pop(key) for key in constructor_keys if key in attrs}

    param = param_cls(data=tensor, **cls_kwargs)
    set_weight_attrs(param, attrs)
    param.param_name = name
    param.ignore_warning = True
    if scale_type in ("tensor", "input_scale"):
        param.needs_scalar_to_array = True
    return param
def prepare_moe_param(tensor, name, extra_attrs):
    """Wrap ``tensor`` in a frozen ``torch.nn.Parameter`` for an MoE sublayer.

    Derived attributes are added before attaching everything with
    ``set_weight_attrs``:

    * ``quant_method`` mirrors ``scale_type`` when the latter is present.
    * ``is_transposed`` is ``input_dim < output_dim`` when both are present.

    Args:
        tensor: Backing tensor for the parameter.
        name: Stored on the result as ``param_name``.
        extra_attrs: Attribute dict describing the parameter.

    Returns:
        The constructed ``torch.nn.Parameter``.
    """
    # Work on a copy so the caller's dict is not mutated (consistent with
    # ``prepare_param``).
    extra_attrs = extra_attrs.copy()
    param = torch.nn.Parameter(tensor, requires_grad=False)
    if "scale_type" in extra_attrs:
        extra_attrs["quant_method"] = extra_attrs["scale_type"]
    if "input_dim" in extra_attrs and "output_dim" in extra_attrs:
        input_dim = extra_attrs["input_dim"]
        output_dim = extra_attrs["output_dim"]
        extra_attrs["is_transposed"] = input_dim < output_dim
    set_weight_attrs(param, extra_attrs)
    param.param_name = name
    return param
def may_pad_loaded_weight(param, loaded_weight):
pad_shape = getattr(param, "pad_shape", None)
if pad_shape is None:
return loaded_weight
value = 1 if loaded_weight.dtype == torch.float8_e8m0fnu else 0
padding = []
for x in pad_shape[::-1][: loaded_weight.ndim]:
padding += [0, x]
loaded_weight = torch.nn.functional.pad(
input=loaded_weight,
pad=padding,
value=value,
)
return loaded_weight
def compressed_tensors_get_config(config: dict[str, Any], key: str):
    """Extract the ``key`` section of the first config group targeting "Linear".

    Args:
        config: Checkpoint quantization config containing ``config_groups``
            and ``quant_method``.
        key: Either ``"weights"`` or ``"input_activations"``.

    Returns:
        A copy of the selected section, extended with ``quant_method`` and,
        depending on the method, ``format`` (compressed-tensors) or
        ``quant_algo`` (modelopt). ``None`` when no group targets "Linear",
        the group has no ``weights`` entry, or the requested section is
        missing or ``None``.
    """
    assert key in ["weights", "input_activations"]

    # Only the first group that lists "Linear" among its targets is used.
    linear_group = next(
        (
            group
            for group in config["config_groups"].values()
            if "Linear" in group["targets"]
        ),
        None,
    )
    if linear_group is None or "weights" not in linear_group:
        return None

    section = linear_group.get(key)
    if section is None:
        return None

    selected = section.copy()
    quant_method = config["quant_method"]
    selected["quant_method"] = quant_method
    if quant_method == "compressed-tensors":
        selected["format"] = config["format"]
    elif quant_method == "modelopt":
        selected["quant_algo"] = config["quant_algo"]
    return selected
class HummingConfig(QuantizationConfig):
    """Quantization config that routes linear and MoE layers to humming kernels.

    ``full_config`` is the checkpoint's quantization config (possibly empty).
    When it is empty, weights may be quantized online according to
    ``VLLM_HUMMING_ONLINE_QUANT_CONFIG``; when that env config sets
    ``force_requant``, checkpoint weights are re-quantized to the env schema.
    """

    packed_modules_mapping = {}

    def __init__(self, full_config: dict[str, Any] | None = None):
        assert_humming_available()
        self.full_config: dict[str, Any] = full_config or {}

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        return "humming"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        return 75

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "HummingConfig":
        return cls(full_config=config)

    @classmethod
    def override_quantization_method(
        cls, hf_quant_cfg, user_quant, hf_config=None
    ) -> QuantizationMethods | None:
        # Only take over when the user explicitly asked for humming.
        return "humming" if user_quant == "humming" else None

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        self.hf_to_vllm_mapper = hf_to_vllm_mapper

    def is_layer_skipped(self, config: dict[str, Any], prefix: str):
        """Return True when the layer named ``prefix`` must stay unquantized.

        A layer is skipped when any ignore-list entry is a substring of
        ``prefix``, when ``prefix`` contains ``"lm_head"``, or when a
        ``dynamic`` key starting with ``"-"`` has a pattern (the key with its
        first two characters removed) that matches ``prefix``.
        """
        keys = ["ignored_layers", "ignore", "modules_to_not_convert"]
        ignored_layers = self.get_from_keys_or(config, keys, []) or []
        if hasattr(self, "hf_to_vllm_mapper"):
            ignored_layers = self.hf_to_vllm_mapper.apply_list(ignored_layers)
        if any(module_name in prefix for module_name in ignored_layers):
            return True
        if "lm_head" in prefix:
            return True
        # ``dynamic`` may be present but None; treat that as "no rules"
        # instead of failing while iterating.
        for regex in config.get("dynamic") or {}:
            if regex[:1] != "-":
                continue
            if re.match(regex[2:], prefix):
                return True
        return False

    def get_layer_weight_schema(self, config: dict[str, Any], prefix: str):
        """Build the weight schema for ``prefix`` from ``config``.

        Returns ``None`` when the layer is skipped, when a compressed-tensors
        / modelopt config has no usable "Linear" weights group, or when the
        resulting config carries no ``quant_method``.
        """
        if self.is_layer_skipped(config, prefix):
            return None
        if config["quant_method"] in ["compressed-tensors", "modelopt"]:
            group_config = compressed_tensors_get_config(config, "weights")
            if group_config is None:
                return None
            config = group_config

        layer_config = config
        layer_dynamic = config.get("dynamic", {})
        if not isinstance(layer_dynamic, dict):
            layer_dynamic = {}
        # The first matching "+"-prefixed rule overrides the base config.
        for regex, override_config in layer_dynamic.items():
            if regex[:1] != "+":
                continue
            if re.match(regex[2:], prefix):
                layer_config = config.copy()
                layer_config.update(override_config)
                break

        if "quant_method" in layer_config:
            return BaseWeightSchema.from_config(layer_config)
        return None

    def get_layer_input_schema(self, config: dict[str, Any], prefix: str):
        """Build the activation (input) schema for ``prefix``, or ``None``."""
        if self.is_layer_skipped(config, prefix):
            return None
        if config["quant_method"] in ["compressed-tensors", "modelopt"]:
            group_config = compressed_tensors_get_config(config, "input_activations")
            if group_config is None:
                return None
            config = group_config
        if config.get("quant_method", None) in BaseInputSchema.INPUT_SCHEMA_MAP:
            return BaseInputSchema.from_config(config)
        return None

    def get_quant_config_for_layer(
        self, prefix: str, layer_type: str
    ) -> "HummingLayerQuantizationConfig | None":
        """Resolve the per-layer quantization settings for ``prefix``.

        Args:
            prefix: Layer name.
            layer_type: ``"moe"``, ``"linear"`` or ``"other"``.

        Returns:
            A ``HummingLayerQuantizationConfig`` or ``None`` when the layer
            has no weight schema (or is a non-MoE layer of a
            ``gpt_oss_mxfp4`` checkpoint).
        """
        weight_schema: BaseWeightSchema | None = None
        force_weight_schema: HummingWeightSchema | None = None
        if self.full_config:
            weight_schema = self.get_layer_weight_schema(self.full_config, prefix)

        is_online_quant = False
        # Copy before writing ``quant_method`` so the dict returned by
        # ``envs`` is not mutated (same as the input-config path below).
        online_quant_config = dict(envs.VLLM_HUMMING_ONLINE_QUANT_CONFIG or {})
        if not self.full_config or online_quant_config.get("force_requant", False):
            online_quant_config["quant_method"] = "humming"
            schema = self.get_layer_weight_schema(online_quant_config, prefix)
            if not self.full_config:
                # No checkpoint quantization: quantize while loading.
                weight_schema = schema
                is_online_quant = True
            else:
                # Checkpoint is quantized: re-quantize after loading.
                force_weight_schema = schema

        if weight_schema is None:
            return None
        if weight_schema.quant_method == "gpt_oss_mxfp4" and layer_type != "moe":
            return None

        input_schema = None
        force_input_schema = None
        if self.full_config:
            input_schema = self.get_layer_input_schema(self.full_config, prefix)
        if envs.VLLM_HUMMING_INPUT_QUANT_CONFIG:
            quant_config = envs.VLLM_HUMMING_INPUT_QUANT_CONFIG.copy()
            quant_config["quant_method"] = "humming"
            force_input_schema = self.get_layer_input_schema(quant_config, prefix)
            if input_schema is None:
                input_schema = force_input_schema
        if force_weight_schema is not None and force_input_schema is None:
            force_input_schema = HummingInputSchema()

        return HummingLayerQuantizationConfig(
            weight_schema=weight_schema,
            input_schema=input_schema,
            force_weight_schema=force_weight_schema,
            force_input_schema=force_input_schema,
            is_online_quant=is_online_quant,
        )

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> "QuantizeMethodBase | None":
        """Pick the quantize method for ``layer``.

        Linear / MoE layers without a resolved quant config fall back to the
        unquantized methods; other layer types get ``None``.
        """
        layer_type = "other"
        if isinstance(layer, FusedMoE):
            layer_type = "moe"
        elif isinstance(layer, LinearBase):
            layer_type = "linear"

        # TODO: remove this after humming moe backend is ready
        quant_method = self.full_config.get("quant_method", None)
        moe_activation = getattr(layer, "activation", None)
        if quant_method == "mxfp4" and moe_activation == MoEActivation.SWIGLUOAI:
            # The key was previously misspelled ("quan_method"), so this
            # override was never visible to the schema lookup.
            self.full_config["quant_method"] = "gpt_oss_mxfp4"

        quant_config = self.get_quant_config_for_layer(prefix, layer_type)
        if quant_config is None:
            if isinstance(layer, FusedMoE):
                return UnquantizedFusedMoEMethod(layer.moe_config)
            elif isinstance(layer, LinearBase):
                return UnquantizedLinearMethod()
        elif isinstance(layer, LinearBase):
            return HummingLinearMethod(quant_config)
        elif isinstance(layer, FusedMoE):
            return HummingMoEMethod(quant_config, layer.moe_config)
        return None
class HummingLayerQuantizationConfig(HummingConfig):
    """Resolved quantization settings for one layer.

    Holds the checkpoint weight/input schemas, the optional forced (requant)
    schemas, and whether weights are quantized online while loading.
    """

    def __init__(
        self,
        weight_schema: "BaseWeightSchema",
        input_schema: "BaseInputSchema | None" = None,
        force_weight_schema: "HummingWeightSchema | None" = None,
        force_input_schema: "HummingInputSchema | None" = None,
        is_online_quant: bool = False,
    ):
        # NOTE: the parent ``__init__`` is not invoked here.
        self.weight_schema = weight_schema
        # Fall back to a default input schema when none is given.
        self.input_schema = (
            HummingInputSchema() if input_schema is None else input_schema
        )
        self.force_weight_schema = force_weight_schema
        self.force_input_schema = force_input_schema
        self.is_online_quant = is_online_quant

    @classmethod
    def from_config(cls, config):
        """Build an instance whose weight schema is parsed from ``config``."""
        return cls(BaseWeightSchema.from_config(config))

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> QuantizeMethodBase | None:
        """Not supported on the per-layer config; always raises."""
        raise NotImplementedError
class HummingLinearMethod(LinearMethodBase):
    """Linear method that loads weights into humming format and runs humming GEMM."""

    def __init__(self, quant_config: HummingLayerQuantizationConfig):
        self.quant_config = quant_config
        self.weight_schema = quant_config.weight_schema
        self.input_schema = quant_config.input_schema
        self.force_weight_schema = quant_config.force_weight_schema
        self.force_input_schema = quant_config.force_input_schema
        self.is_online_quant = self.quant_config.is_online_quant

    def prepare_weight_loader(self, layer: torch.nn.Module, weight_loader: Callable):
        """Wrap ``weight_loader`` with humming-specific handling.

        The returned loader handles three cases:

        * a floating-point ``weight`` with online quantization enabled is
          quantized and its pieces are loaded into the layer's parameters;
        * a floating-point ``weight`` without online quantization switches
          the layer to the unquantized linear method;
        * everything else is passed through the weight schema's
          ``process_loaded_weight`` and then to ``weight_loader``.
        """

        def new_weight_loader(
            param: torch.nn.Parameter,
            loaded_weight: torch.Tensor,
            shard_id: str | int | None = None,
        ):
            name = param.param_name
            float_dtypes = [torch.float16, torch.bfloat16, torch.float32]
            is_unquantized = name == "weight" and loaded_weight.dtype in float_dtypes

            if is_unquantized and self.is_online_quant:
                # online quant (fp16/bf16 -> quant_type)
                assert isinstance(self.weight_schema, HummingWeightSchema)
                f16_dtype = DataType.from_torch_dtype(layer.param_dtype)
                has_global_scale = "TENSOR" in str(self.weight_schema.weight_scale_type)
                tensor_list = quantize_weight(
                    weight=loaded_weight,
                    dtype=self.weight_schema.b_dtype,
                    scale_dtype=self.weight_schema.bs_dtype or f16_dtype,
                    group_size=self.weight_schema.weight_scale_group_size,
                    has_zero_point=self.weight_schema.has_zero_point,
                    has_global_scale=has_global_scale,
                    is_fp_zero_point=self.weight_schema.is_fp_zero_point,
                    pack=True,
                )
                key_list = ["weight", "weight_scale", "zero_point", "global_scale"]
                for key, tensor in zip(key_list, tensor_list):
                    if tensor is None or tensor.nelement() == 0:
                        continue
                    param = getattr(layer, key)
                    param.weight_loader(param, tensor, shard_id)
                return None

            elif is_unquantized and not self.is_online_quant:
                # fallback to unquantized linear
                # some model skip some layer when quantizing model, but
                # don't mark the layer as unquantized.
                if not layer.is_fallback:
                    layer.is_fallback = True
                    # Drop every quantized parameter (keeping bias) and the
                    # locks buffer, then recreate a plain weight.
                    for param_name, _ in list(layer.named_parameters()):
                        if param_name != "bias":
                            delattr(layer, param_name)
                    delattr(layer, "locks")
                    self.__class__ = UnquantizedLinearMethod  # type: ignore
                    tensor = torch.empty(
                        (
                            layer.output_partition_sizes_sum,
                            layer.input_size_per_partition,
                        ),
                        dtype=layer.param_dtype,
                        device=param.device,
                    )
                    extra_weight_attrs = layer.extra_weight_attrs.copy()
                    orig_weight_loader = extra_weight_attrs.pop("weight_loader")
                    layer.weight = ModelWeightParameter(
                        data=tensor,
                        input_dim=1,
                        output_dim=0,
                        weight_loader=orig_weight_loader,
                    )
                    layer.weight.tp_size = layer.tp_size
                    layer.weight.tp_rank = layer.tp_rank
                    set_weight_attrs(layer.weight, extra_weight_attrs)
                # Always load into the (re)created plain weight.
                param = layer.weight
                if shard_id is not None:
                    return layer.weight.weight_loader(param, loaded_weight, shard_id)
                return layer.weight.weight_loader(param, loaded_weight)

            # weight processing logic for specific quantization schema
            loaded_weight = self.weight_schema.process_loaded_weight(
                tensor=loaded_weight,
                name=name,
            )
            if shard_id is not None:
                return weight_loader(param, loaded_weight, shard_id)
            return weight_loader(param, loaded_weight)

        return new_weight_loader

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Create the layer's parameters as described by the weight/input schemas.

        Also records the shape information on ``layer`` that the weight
        loader and ``process_weights_after_loading`` read back later, and
        registers a ``locks`` int32 buffer.
        """
        layer.is_fallback = False
        layer.param_dtype = params_dtype
        layer.input_size = input_size
        layer.output_size = output_size
        layer.input_size_per_partition = input_size_per_partition
        layer.output_partition_sizes_sum = sum(output_partition_sizes)
        layer.output_partition_sizes = output_partition_sizes
        # Kept with the original weight_loader for the unquantized fallback.
        layer.extra_weight_attrs = extra_weight_attrs.copy()

        weight_loader = extra_weight_attrs.get("weight_loader", default_weight_loader)
        new_weight_loader = self.prepare_weight_loader(layer, weight_loader)
        extra_weight_attrs["weight_loader"] = new_weight_loader

        for key in ["weight_block_size", "block_structure"]:
            block_size = getattr(self.weight_schema, key, None)
            if block_size is not None:
                layer.weight_block_size = block_size

        weight_tensor_attrs = self.weight_schema.get_tensors_attrs(
            shape_n=layer.output_partition_sizes_sum,
            shape_k=layer.input_size_per_partition,
            param_dtype=params_dtype,
            stack_size=len(layer.output_partition_sizes),
        )
        input_tensor_attrs = self.input_schema.get_tensors_attrs(
            shape_k=layer.input_size_per_partition,
            param_dtype=params_dtype,
            stack_size=len(layer.output_partition_sizes),
        )
        tensors_attrs = weight_tensor_attrs | input_tensor_attrs

        for name, attrs in tensors_attrs.items():
            tensor = torch.empty(attrs["shape"], dtype=attrs["dtype"])
            extra_attrs = attrs.get("extra_attrs", {}).copy()
            extra_attrs.update(extra_weight_attrs)
            param = prepare_param(tensor, name, extra_attrs)
            setattr(layer, name, param)

        locks = torch.zeros(1024, dtype=torch.int32)
        layer.register_buffer("locks", locks)

        if self.force_input_schema is not None:
            self.input_schema = self.force_input_schema

        # Ensure a ``weight`` parameter exists even when the schema defines
        # none, using a scalar placeholder.
        if not hasattr(layer, "weight"):
            param = prepare_param(torch.tensor(0), "weight", extra_weight_attrs)
            layer.weight = param

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Convert loaded tensors to humming format and prepare kernel metadata.

        Does nothing for layers that fell back to the unquantized method.
        """
        if layer.is_fallback:
            return None

        # convert from checkpoint format to humming format
        if not isinstance(self.weight_schema, HummingWeightSchema):
            self.weight_schema, tensors = self.weight_schema.convert_humming(
                tensors=layer.state_dict(),
                shape_n_stacks=layer.output_partition_sizes,
                shape_k_stacks=[layer.input_size_per_partition],
                param_dtype=layer.param_dtype,
            )
            self.input_schema, _ = self.input_schema.convert_humming(
                tensors=layer.state_dict(),
                shape_n_stacks=layer.output_partition_sizes,
                shape_k_stacks=[layer.input_size_per_partition],
                param_dtype=layer.param_dtype,
            )
            for name, _ in list(layer.named_parameters()):
                delattr(layer, name)
            for name, tensor in tensors.items():
                param = torch.nn.Parameter(tensor, requires_grad=False)
                setattr(layer, name, param)
            del tensors

        # force requant (origin quant setting -> fp16/bf16 -> new_quant setting)
        assert isinstance(self.weight_schema, HummingWeightSchema)
        force_requant = self.force_weight_schema is not None
        if force_requant and self.weight_schema != self.force_weight_schema:
            tensors = self.weight_schema.requant_tensors(
                tensors=layer.state_dict(),
                target_weight_schema=self.force_weight_schema,
                param_dtype=layer.param_dtype,
            )
            self.weight_schema = self.force_weight_schema
            for name, _ in list(layer.named_parameters()):
                if name != "bias":
                    delattr(layer, name)
            for name, tensor in tensors.items():
                param = torch.nn.Parameter(tensor, requires_grad=False)
                setattr(layer, name, param)
            del tensors

        # prepare layer config from humming kernel
        HummingMethod.prepare_layer_meta(
            layer=layer,
            shape_n=layer.output_partition_sizes_sum,
            shape_k=layer.input_size_per_partition,
            weight_schema=self.weight_schema,
            input_schema=self.input_schema,
            pad_n_to_multiple=256,
            pad_k_to_multiple=128,
            has_bias=layer.has_bias,
            torch_dtype=layer.param_dtype,
        )
        # preprocess weight for inference
        HummingMethod.transform_humming_layer(layer)

        # compute_config: kernel configs that do not directly affect weights
        # but significantly impact kernel behavior or computation precision.
        # see https://github.com/inclusionAI/humming/blob/main/docs/config.md
        compute_config = {
            "use_batch_invariant": envs.VLLM_BATCH_INVARIANT,
            "use_f16_accum": envs.VLLM_HUMMING_USE_F16_ACCUM,
            "gemm_type": "dense",
        }
        self.compute_config = json.dumps(compute_config)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the humming GEMM on ``x`` and restore its leading dimensions.

        NOTE(review): ``bias`` is not forwarded to the kernel here; presumably
        the kernel reads the bias from ``layer`` -- confirm.
        """
        # ``reshape`` (instead of ``view``) also accepts non-contiguous
        # activations; it still returns a view whenever one is possible.
        flatten_inputs = x.reshape(-1, x.size(-1))
        output = HummingMethod.forward_layer(
            layer=layer,
            inputs=flatten_inputs,
            compute_config=self.compute_config,
        )
        output = output.reshape(*x.shape[:-1], output.size(-1))
        return output
class HummingMoEMethod(FusedMoEMethodBase):
    """Fused-MoE method backed by humming kernels.

    The layer holds two "sublayers", ``w13`` and ``w2``; every tensor of a
    sublayer is stored on the layer as ``{sublayer_name}_{tensor_name}``.
    """

    def __init__(
        self, quant_config: HummingLayerQuantizationConfig, moe: "FusedMoEConfig"
    ) -> None:
        super().__init__(moe)
        self.quant_config = quant_config
        self.moe = moe
        self.weight_schema = quant_config.weight_schema
        self.input_schema = quant_config.input_schema
        self.force_weight_schema = quant_config.force_weight_schema
        self.force_input_schema = quant_config.force_input_schema

    def prepare_weight_loader(self, layer, weight_loader):
        """Wrap ``weight_loader`` so floating-point expert weights are quantized online.

        Any other tensor goes through the weight schema's
        ``process_loaded_weight`` and is then handed to ``weight_loader``.
        """

        def new_weight_loader(
            param: torch.nn.Parameter,
            loaded_weight: torch.Tensor,
            weight_name: str,
            shard_id: str,
            expert_id: int | None = None,
            return_success: bool = False,
        ):
            name = param.param_name
            float_dtypes = [torch.float16, torch.bfloat16, torch.float32]
            is_unquantized = name == "weight" and loaded_weight.dtype in float_dtypes

            # online quant (fp16/bf16 -> quant_type)
            if is_unquantized:
                assert isinstance(self.weight_schema, HummingWeightSchema)
                f16_dtype = DataType.from_torch_dtype(layer.param_dtype)
                has_global_scale = "TENSOR" in str(self.weight_schema.weight_scale_type)
                tensor_list = quantize_weight(
                    weight=loaded_weight,
                    dtype=self.weight_schema.b_dtype,
                    scale_dtype=self.weight_schema.bs_dtype or f16_dtype,
                    group_size=self.weight_schema.weight_scale_group_size,
                    has_zero_point=self.weight_schema.has_zero_point,
                    has_global_scale=has_global_scale,
                    is_fp_zero_point=self.weight_schema.is_fp_zero_point,
                    pack=True,
                )
                key_list = ["weight", "weight_scale", "zero_point", "global_scale"]
                # Every shard other than "w2" belongs to the fused w13 sublayer.
                sublayer_name = "w2" if shard_id == "w2" else "w13"
                success = True
                for key, tensor in zip(key_list, tensor_list):
                    if tensor is None or tensor.nelement() == 0:
                        continue
                    param = getattr(layer, sublayer_name + "_" + key)
                    part_success = param.weight_loader(
                        param=param,
                        loaded_weight=tensor.cpu(),
                        weight_name=shard_id + "_" + key,
                        shard_id=shard_id,
                        expert_id=expert_id,
                        return_success=return_success,
                    )
                    success = success and part_success
                return success if return_success else None

            # weight processing logic for specific quantization schema
            loaded_weight = self.weight_schema.process_loaded_weight(
                tensor=loaded_weight,
                name=name,
            )
            return weight_loader(
                param,
                loaded_weight,
                weight_name,
                shard_id=shard_id,
                expert_id=expert_id,
                return_success=return_success,
            )

        return new_weight_loader

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Create the ``w13`` / ``w2`` parameters described by the weight schema.

        Stores the per-sublayer shapes and tensor attributes on
        ``layer.sublayer_configs`` and registers a ``locks`` int32 buffer.
        """
        layer.num_experts = num_experts
        layer.param_dtype = params_dtype
        layer.intermediate_size = intermediate_size_per_partition

        weight_loader = extra_weight_attrs.get("weight_loader", default_weight_loader)
        weight_loader = self.prepare_weight_loader(layer, weight_loader)
        extra_weight_attrs["weight_loader"] = weight_loader

        # sublayer: a layer contains multiple sets of weights for quantized GEMM
        # (e.g., weight, weight_scale, etc.).
        # The weight names of sublayer start with the prefix "{sublayer_name}_"
        layer.sublayer_configs = {
            "w13": {
                "shape_n": intermediate_size_per_partition * 2,
                "shape_k": hidden_size,
                "tensors_attrs": self.weight_schema.get_padded_tensors_attrs(
                    shape_n=intermediate_size_per_partition * 2,
                    shape_k=hidden_size,
                    num_experts=num_experts,
                    param_dtype=params_dtype,
                    has_bias=self.moe.has_bias,
                ),
            },
            "w2": {
                "shape_n": hidden_size,
                "shape_k": intermediate_size_per_partition,
                "tensors_attrs": self.weight_schema.get_padded_tensors_attrs(
                    shape_n=hidden_size,
                    shape_k=intermediate_size_per_partition,
                    num_experts=num_experts,
                    param_dtype=params_dtype,
                    has_bias=self.moe.has_bias,
                ),
            },
        }

        for sublayer_name, configs in layer.sublayer_configs.items():
            for name, attrs in configs["tensors_attrs"].items():
                tensor = torch.empty(attrs["shape"], dtype=attrs["dtype"])
                extra_attrs = attrs.get("extra_attrs", {}).copy()
                extra_attrs.update(extra_weight_attrs)
                param = prepare_moe_param(tensor, name, extra_attrs)
                setattr(layer, f"{sublayer_name}_{name}", param)

        if self.force_input_schema is not None:
            self.input_schema = self.force_input_schema

        locks = torch.zeros(1024, dtype=torch.int32)
        layer.register_buffer("locks", locks)

    def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
        """Describe activation/weight quantization from the ``w13`` schemas.

        Runs ``process_weights_after_loading`` first (a no-op if it already
        ran) because the schemas are only available afterwards.
        """
        self.process_weights_after_loading(layer)
        input_schema = self.input_schemas["w13"]
        weight_schema = self.weight_schemas["w13"]

        a_dtype = input_schema.a_dtype
        if a_dtype is None or a_dtype.num_bits == 16:
            a_quant_desc = FusedMoEQuantDesc(dtype=None)
        else:
            shape = GroupShape(row=1, col=-1)
            a_quant_desc = FusedMoEQuantDesc(dtype=str(a_dtype), shape=shape)

        weight_scale_group_size = weight_schema.weight_scale_group_size
        weight_scale_group_size_n = weight_schema.weight_scale_group_size_n
        weight_group_shape: tuple[int, ...] = ()
        if weight_scale_group_size_n > 1:
            weight_group_shape = GroupShape(
                row=weight_scale_group_size,
                col=weight_scale_group_size_n,
            )
        elif weight_scale_group_size == 0:
            weight_group_shape = GroupShape(row=-1, col=1)
        else:
            weight_group_shape = GroupShape(row=weight_scale_group_size, col=1)

        w1_quant_desc = FusedMoEQuantDesc(
            dtype=str(weight_schema.b_dtype),
            shape=weight_group_shape,
            scale=getattr(layer, "w13_weight_scale", None),
            alpha_or_gscale=getattr(layer, "w13_global_scale", None),
            zp=getattr(layer, "w13_zero_point", None),
            bias=getattr(layer, "w13_bias", None),
        )
        w2_quant_desc = FusedMoEQuantDesc(
            dtype=str(weight_schema.b_dtype),
            shape=weight_group_shape,
            scale=getattr(layer, "w2_weight_scale", None),
            alpha_or_gscale=getattr(layer, "w2_global_scale", None),
            zp=getattr(layer, "w2_zero_point", None),
            bias=getattr(layer, "w2_bias", None),
        )
        return FusedMoEQuantConfig(
            _a1=a_quant_desc,
            _a2=a_quant_desc,
            _w1=w1_quant_desc,
            _w2=w2_quant_desc,
        )

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Convert each sublayer to humming format and prepare kernel metadata.

        Runs at most once per method instance; later calls return immediately.
        """
        if getattr(self, "processed", False):
            return
        self.processed = True
        self.weight_schemas = {}
        self.input_schemas = {}

        for sublayer_name, configs in layer.sublayer_configs.items():
            prefix = sublayer_name + "_"
            input_schema = self.input_schema
            weight_schema = self.weight_schema

            # convert from checkpoint format to humming format
            if not isinstance(weight_schema, HummingWeightSchema):
                tensors: dict[str, torch.Tensor] = {
                    key.removeprefix(prefix): value
                    for key, value in layer.state_dict().items()
                    if key.startswith(prefix)
                }
                shape_k_stacks = [configs["shape_k"]]
                shape_n_stacks = [configs["shape_n"]]
                if sublayer_name == "w13":
                    # w13 stacks two equally sized halves along N.
                    shape_n_stacks = [configs["shape_n"] // 2] * 2
                weight_schema, tensors = weight_schema.convert_humming(
                    tensors=tensors,
                    shape_n_stacks=shape_n_stacks,
                    shape_k_stacks=shape_k_stacks,
                    param_dtype=layer.param_dtype,
                    num_experts=layer.num_experts,
                )
                input_schema, _ = input_schema.convert_humming(
                    tensors=tensors,
                    shape_n_stacks=shape_n_stacks,
                    shape_k_stacks=shape_k_stacks,
                    param_dtype=layer.param_dtype,
                    num_experts=layer.num_experts,
                )
                for name, _ in list(layer.named_parameters()):
                    if not name.startswith(prefix):
                        continue
                    delattr(layer, name)
                for name, tensor in tensors.items():
                    name = f"{sublayer_name}_{name}"
                    param = torch.nn.Parameter(tensor, requires_grad=False)
                    setattr(layer, name, param)

            # force requant (origin quant setting -> fp16/bf16 -> new_quant setting)
            assert isinstance(weight_schema, HummingWeightSchema)
            force_requant = self.force_weight_schema is not None
            if force_requant and weight_schema != self.force_weight_schema:
                tensors = {
                    key.removeprefix(prefix): value
                    for key, value in layer.state_dict().items()
                    if key.startswith(prefix)
                }
                tensors = weight_schema.requant_tensors(
                    tensors=tensors,
                    target_weight_schema=self.force_weight_schema,
                    param_dtype=layer.param_dtype,
                )
                weight_schema = self.force_weight_schema
                for name, _ in list(layer.named_parameters()):
                    if not name.startswith(prefix):
                        continue
                    if name == sublayer_name + "_bias":
                        continue
                    delattr(layer, name)
                for name, tensor in tensors.items():
                    name = f"{sublayer_name}_{name}"
                    param = torch.nn.Parameter(tensor, requires_grad=False)
                    setattr(layer, name, param)
                del tensors

            # Record the schemas only after the optional requant so that
            # ``get_fused_moe_quant_config`` sees the schema the kernel is
            # actually prepared with below.
            self.weight_schemas[sublayer_name] = weight_schema
            self.input_schemas[sublayer_name] = input_schema

            # prepare layer config from humming kernel
            HummingMethod.prepare_layer_meta(
                layer=layer,
                shape_n=configs["shape_n"],
                shape_k=configs["shape_k"],
                pad_n_to_multiple=256,
                pad_k_to_multiple=128,
                input_schema=input_schema,
                weight_schema=weight_schema,
                has_bias=self.moe.has_bias,
                num_experts=layer.num_experts,
                torch_dtype=layer.param_dtype,
                sublayer_name=sublayer_name,
            )
            # preprocess weight for inference
            HummingMethod.transform_humming_layer(layer, sublayer_name=sublayer_name)

        # use moe modular
        experts: HummingIndexedExperts | HummingGroupedExperts
        if get_humming_moe_gemm_type() == "indexed":
            experts = HummingIndexedExperts(layer, self)
        else:
            experts = HummingGroupedExperts(layer, self)
        self.experts = experts

    def select_gemm_impl(
        self,
        prepare_finalize,
        layer: torch.nn.Module,
    ):
        """Pick the experts implementation for ``prepare_finalize``.

        Batched activation format selects the batched grouped experts;
        otherwise the choice follows ``get_humming_moe_gemm_type()``.
        """
        from vllm.model_executor.layers.fused_moe import modular_kernel as mk

        activation_format = prepare_finalize.activation_format
        if activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
            return BatchedHummingGroupedExperts(layer, self, prepare_finalize)
        elif get_humming_moe_gemm_type() == "indexed":
            return HummingIndexedExperts(layer, self, prepare_finalize)
        else:
            return HummingGroupedExperts(layer, self, prepare_finalize)

    def apply(
        self,
        layer: FusedMoE,
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run the experts on ``x`` and return the output workspace.

        ``shared_experts_input`` is accepted for interface compatibility and
        is not used here.
        """
        workspace1, workspace2, output = self.experts.make_workspaces(
            M=topk_ids.size(0),
            topk=topk_ids.size(1),
            activation=layer.activation,
        )
        # The result is read from ``output``, which must alias ``workspace1``.
        assert workspace1.data_ptr() == output.data_ptr()
        self.experts.main_apply(
            hidden_states=x,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            workspace1=workspace1,
            workspace2=workspace2,
            expert_tokens_meta=None,
        )
        return output
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size,
)
def humming_moe_align(
configs: list[int],
topk_ids: torch.Tensor,
num_experts: int,
expert_map: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
assert len(configs) > 0 and len(configs) % 3 == 0
# NOTE: we choose moe_block_size based on
# num_tokens * top_k (= topk_ids.nelement())
shape_m = topk_ids.nelement()
for i in range(len(configs) // 3):
if shape_m > configs[i * 3] and shape_m <= configs[i * 3 + 1]:
block_size = configs[i * 3 + 2]
break
else:
raise ValueError(f"Could not find a matching block_size for shape_m={shape_m}")
return moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
expert_map=expert_map,
pad_sorted_ids=False,
ignore_invalid_experts=True,
)
......@@ -605,8 +605,8 @@ def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size)
def _adjust_shard_indexes_for_packing(
shard_size, shard_offset, packed_factor, marlin_tile_size
):
shard_size = shard_size // packed_factor
shard_offset = shard_offset // packed_factor
shard_size = round(shard_size // packed_factor)
shard_offset = round(shard_offset // packed_factor)
if marlin_tile_size is not None:
return _adjust_shard_indexes_for_marlin(
shard_size=shard_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment