# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import torch import torch._inductor.pattern_matcher as pm from torch import fx from torch._inductor.pattern_matcher import PatternMatcherPass from torch._ops import OpOverload import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 from vllm._aiter_ops import rocm_aiter_ops from vllm.compilation.activation_quant_fusion import ActivationQuantPattern from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, ScaleDesc, ) from vllm.platforms import current_platform from .fusion import ( FusedRMSQuantKey, ) from .inductor_pass import enable_fake_mode from .matcher_utils import ( MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm, MatcherSiluAndMul, ) from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) FP8_DTYPE = current_platform.fp8_dtype() class AiterRMSNormQuantPattern: def __init__( self, epsilon: float, key: FusedRMSQuantKey, match_aiter_quant: bool = True ): self.epsilon = epsilon self.quant_dtype = key.quant.dtype self.rmsnorm_matcher = ( MatcherRMSNorm(epsilon, match_rocm_aiter=True) if not key.fused_add else MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=True) ) self.quant_matcher = MatcherQuantFP8( key.quant, match_rocm_aiter=match_aiter_quant, ) class AiterRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern): """AITER RMSNorm + Dynamic Quantization pattern.""" FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_op() def __init__( self, epsilon: float, quant_dtype: torch.dtype, match_aiter_quant: bool = True, group_shape: GroupShape = GroupShape.PER_TOKEN, symmetric=True, ): scale = ScaleDesc(torch.float32, False, group_shape) key = FusedRMSQuantKey( fused_add=False, quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), ) super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass): def pattern( input: torch.Tensor, weight: torch.Tensor, ): result_rms = self.rmsnorm_matcher(input, weight) result, scale = self.quant_matcher(result_rms) return result, scale def replacement( input: torch.Tensor, weight: torch.Tensor, ): result = self.FUSED_OP( x=input, weight=weight, epsilon=self.epsilon, quant_dtype=self.quant_dtype, ) return result[0], result[1] pm.register_replacement( pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass, ) class AiterFusedAddRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern): """AITER RMSNorm Fused Add + Dynamic Quantization pattern.""" FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_op() def __init__( self, epsilon: float, quant_dtype: torch.dtype, match_aiter_quant: bool = True, group_shape: GroupShape = GroupShape.PER_TOKEN, symmetric=True, ): scale = ScaleDesc(torch.float32, False, group_shape) key = FusedRMSQuantKey( fused_add=True, quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), ) super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass): def pattern( input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor, ): result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual) result, scale = self.quant_matcher(result_rms) return result, residual_out, scale def replacement( input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor ): result = self.FUSED_OP( x=input, residual=residual, weight=weight, epsilon=self.epsilon, quant_dtype=self.quant_dtype, ) return result[0], result[1], result[2] pm.register_replacement( pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass, ) class AiterRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern): """ This pattern fuses aiter rms_norm & group fp8 quant custom ops into an aiter rms_norm_group_fp8_quant op. """ FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op() def __init__( self, epsilon: float, quant_dtype: torch.dtype, group_shape: GroupShape, match_aiter_quant: bool = True, symmetric=True, ): scale = ScaleDesc(torch.float32, False, group_shape) key = FusedRMSQuantKey( fused_add=False, quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), ) super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, weight: torch.Tensor, ): result_rms = self.rmsnorm_matcher(input, weight) result, scale = self.quant_matcher(result_rms) return result, scale def replacement( input: torch.Tensor, weight: torch.Tensor, ): at = self.FUSED_OP( x=input, weight=weight, variance_epsilon=self.epsilon, group_size=128, ) return at[0], at[1] pm.register_replacement( pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass ) class AiterFusedAddRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern): """ This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops into a aiter rms_norm_with_add_group_fp8_quant op. """ FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_op() def __init__( self, epsilon: float, quant_dtype: torch.dtype, group_shape: GroupShape, match_aiter_quant: bool = True, symmetric=True, ): scale = ScaleDesc(torch.float32, False, group_shape) key = FusedRMSQuantKey( fused_add=True, quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), ) super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor, ): result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual) result, scale = self.quant_matcher(result_rms) return result, residual_out, scale def replacement( input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor, ): at = self.FUSED_OP( x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon, group_size=128, ) # result, scale, residual return at[0], at[1], at[2] pm.register_replacement( pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass ) class RocmAiterRMSNormFusionPass(VllmPatternMatcherPass): """ This pass fuses aiter rms_norm & vllm/aiter quant custom ops into a fused rms_norm_quant op. It also supports fused_add_rms_norm. """ @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="rocm_aiter_rms_norm_quant_fusion_pass" ) # Make sure fused add patterns are before simple rms norm, # as the latter is a subset of the former in torch ops for epsilon in [1e-5, 1e-6]: # Fuse aiter rms_norm + aiter dynamic group fp8 quant AiterRMSFp8GroupQuantPattern( epsilon, FP8_DTYPE, GroupShape(1, 128) ).register(self.patterns) # Fuse aiter fused_add_rms_norm + aiter dynamic group fp8 quant AiterFusedAddRMSFp8GroupQuantPattern( epsilon, FP8_DTYPE, GroupShape(1, 128) ).register(self.patterns) for match_aiter_quant in [True, False]: # Fuse aiter rms_norm + (aiter / vllm built-in) # dynamic per-token fp8 quant AiterRMSNormDynamicQuantPattern( epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant ).register(self.patterns) # Fuse aiter fused_add_rms_norm + (aiter / vllm built-in) # dynamic per-token fp8 quant AiterFusedAddRMSNormDynamicQuantPattern( epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant ).register(self.patterns) self.dump_patterns(config, self.patterns) @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): self.matched_count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", self.matched_count) def uuid(self) -> Any: fusion_patterns = [ AiterRMSNormDynamicQuantPattern, AiterFusedAddRMSNormDynamicQuantPattern, AiterRMSFp8GroupQuantPattern, AiterFusedAddRMSFp8GroupQuantPattern, ] return self.hash_source(self, *fusion_patterns) class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): """ This pattern fuses aiter silu_and_mul & group fp8 quant custom ops into an aiter silu_and_mul_group_fp8_quant op. """ FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op() def __init__(self, quant_op: OpOverload): self.silu_and_mul_matcher = MatcherSiluAndMul() self.quant_op = quant_op def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, ): at1 = self.silu_and_mul_matcher(input) at2 = self.quant_op(at1, 128) return at2[0], at2[1] def replacement( input: torch.Tensor, ): at = self.FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) return at[0], at[1] inputs = [ self.silu_and_mul_matcher.inputs()[0], ] pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): """ This pass fuses a pre-defined set of custom ops into fused ops. It uses the torch pattern matcher to find the patterns and replace them. Because patterns can only be registered once, the pass is a singleton. This will be addressed in a future version of PyTorch: https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op() TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP] @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" ) for quant_op in self.QUANT_OPS: AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) self.dump_patterns(config, self.patterns) @VllmInductorPass.time_and_log def __call__(self, graph: torch.fx.Graph): self.matched_count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", self.matched_count) def uuid(self): fusion_patterns = [ ActivationQuantPattern, AiterSiluMulFp8GroupQuantPattern, ] return VllmInductorPass.hash_source(self, *fusion_patterns)