Commit 66e601ef (unverified), authored by IriKa, committed by GitHub
Browse files

Support compressed-tensors with nvfp4 or fp8 weights and modelopt with nvfp4...


Support compressed-tensors with nvfp4 or fp8 weights and modelopt with nvfp4 weights on Turing (#33076)
Signed-off-by: IriKa Qiu <qiujie.jq@gmail.com>
parent 0cd259b2
...@@ -29,7 +29,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): ...@@ -29,7 +29,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
@classmethod @classmethod
def get_min_capability(cls) -> int: def get_min_capability(cls) -> int:
# don't restrict as emulations # don't restrict as emulations
return 80 return 75
def create_weights( def create_weights(
self, self,
......
...@@ -34,8 +34,8 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): ...@@ -34,8 +34,8 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
@classmethod @classmethod
def get_min_capability(cls) -> int: def get_min_capability(cls) -> int:
# ampere and up # turing and up
return 80 return 75
# W8A8-Fp8 kernels support only per-tensor and per-channel cases. # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
# So if we have a fused module (QKV, MLP) with per tensor scales, # So if we have a fused module (QKV, MLP) with per tensor scales,
......
...@@ -96,6 +96,7 @@ from vllm.model_executor.parameter import ( ...@@ -96,6 +96,7 @@ from vllm.model_executor.parameter import (
PerTensorScaleParameter, PerTensorScaleParameter,
) )
from vllm.model_executor.utils import replace_parameter from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
from vllm.utils.flashinfer import ( from vllm.utils.flashinfer import (
flashinfer_scaled_fp4_mm, flashinfer_scaled_fp4_mm,
has_flashinfer, has_flashinfer,
...@@ -1110,7 +1111,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): ...@@ -1110,7 +1111,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
self.backend = "none" self.backend = "none"
if envs.VLLM_NVFP4_GEMM_BACKEND is None: if envs.VLLM_NVFP4_GEMM_BACKEND is None:
if has_flashinfer(): if current_platform.has_device_capability(100) and has_flashinfer():
self.backend = "flashinfer-cutlass" self.backend = "flashinfer-cutlass"
elif cutlass_fp4_supported(): elif cutlass_fp4_supported():
self.backend = "cutlass" self.backend = "cutlass"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment