Unverified Commit 02c97d9a authored by Dipika Sikka's avatar Dipika Sikka Committed by GitHub
Browse files

[Quantization] Add compressed-tensors emulations support for NVFP4 (#19879)


Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
parent e795d723
......@@ -133,6 +133,7 @@ if TYPE_CHECKING:
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
def get_default_cache_root():
......@@ -918,6 +919,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# or bad hardware but it may add compute overhead.
"VLLM_COMPUTE_NANS_IN_LOGITS":
lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
# Controls whether emulation is used for NVFP4 generation
# on machines whose device capability is below 100, for
# compressed-tensors models
"VLLM_USE_NVFP4_CT_EMULATIONS":
lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
}
# --8<-- [end:env-vars-definition]
......
......@@ -13,6 +13,7 @@ from compressed_tensors.quantization import (QuantizationArgs,
QuantizationType)
from pydantic import BaseModel
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
......@@ -374,7 +375,8 @@ class CompressedTensorsConfig(QuantizationConfig):
if is_activation_quantization_format(self.quant_format):
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
return CompressedTensorsW4A4Fp4()
else:
logger.warning_once(
......
......@@ -4,11 +4,14 @@ from typing import Callable, Optional
import torch
from torch.nn.parameter import Parameter
import vllm.envs as envs
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
run_nvfp4_emulations)
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
ModelWeightParameter,
PerTensorScaleParameter)
......@@ -26,6 +29,8 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
@classmethod
def get_min_capability(cls) -> int:
    """Return the minimum capability value required by this scheme.

    The result is 80 when ``envs.VLLM_USE_NVFP4_CT_EMULATIONS`` is truthy,
    and 100 otherwise.
    """
    emulation_enabled = envs.VLLM_USE_NVFP4_CT_EMULATIONS
    return 80 if emulation_enabled else 100
@classmethod
......@@ -129,6 +134,17 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
out = run_nvfp4_emulations(
x=x,
input_global_scale=layer.input_global_scale,
weight=layer.weight,
weight_scale_swizzled=layer.weight_scale_swizzled,
weight_global_scale=layer.weight_global_scale)
if bias is not None:
out = out + bias
return out
output_dtype = x.dtype
output_shape = [x.shape[0], layer.weight.shape[0]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment