Unverified Commit f1d78923 authored by Yineng Zhang, committed by GitHub

[Auto Sync] Update modelopt_quant.py (20250920) (#10688)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
parent 7c876de7
@@ -77,6 +77,9 @@ logger = logging.getLogger(__name__)
 CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
     "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
+USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+)
 # Supported activation schemes for the current configuration
 ACTIVATION_SCHEMES = ["static"]
@@ -844,6 +847,17 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if enable_flashinfer_fp4_gemm:
             w = layer.weight.T
             w_scale_interleaved = layer.weight_scale_interleaved.T
+            if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                    backend="cutlass",
+                )
+            else:
                 out = fp4_gemm(
                     x_fp4,
                     w,
...
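
Because the flag is evaluated in a module-level assignment (first hunk), it must be set in the environment before sglang imports modelopt_quant. A hypothetical opt-in usage sketch:

import os

# Must be set before the sglang import below, since
# USE_CUTLASS_BACKEND_FOR_FP4_GEMM is read at module import time.
os.environ["SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"] = "true"

# ... then import/launch sglang as usual; per the hunk above,
# ModelOptFp4LinearMethod will now pass backend="cutlass" to fp4_gemm
# whenever flashinfer FP4 GEMM is enabled.
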