Commit c1a1c04e authored by wenjh

Merge nv_main(2.10) to main


Signed-off-by: wenjh <wenjh@sugon.com>
parents e698a0a7 66aed3ae
@@ -17,6 +17,48 @@ from transformer_engine.pytorch import (
    Float8CurrentScalingQuantizer,
)
import transformer_engine.pytorch.ops as te_ops
from transformer_engine.pytorch.custom_recipes.quantization_nvfp4 import (
    nvfp4_ref_rht_2d_quantizer_factory,
)


@pytest.mark.parametrize("module_type", ["Linear", "LayerNormLinear", "OpsLinear"])
def test_custom_recipe_sanity_modules_nvfp4(module_type):
    """Test modules with NVFP4 custom recipe support"""
    available, reason = te.is_fp8_available(return_reason=True)
    if not torch.cuda.is_available() or not available:
        pytest.skip(f"FP8 unsupported on this device: {reason}")

    torch.manual_seed(0)

    # Simple linear layer with dims divisible by 16
    in_features = 64
    out_features = 64
    batch = 32

    if module_type == "Linear":
        model = Linear(in_features, out_features, params_dtype=torch.bfloat16, bias=False).cuda()
    elif module_type == "LayerNormLinear":
        model = LayerNormLinear(
            in_features, out_features, params_dtype=torch.bfloat16, bias=False
        ).cuda()
    else:  # OpsLinear
        model = te_ops.Linear(
            in_features, out_features, device="cuda", dtype=torch.bfloat16, bias=False
        )

    inp = torch.randn(batch, in_features, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    # Use NVFP4 quantizer factory
    custom_recipe = recipe.CustomRecipe(qfactory=nvfp4_ref_rht_2d_quantizer_factory)

    # Execute with custom recipe
    with autocast(enabled=True, recipe=custom_recipe):
        out = model(inp)
    loss = out.float().sum()
    loss.backward()

    # Basic sanity: gradients exist
    assert inp.grad is not None


@pytest.mark.parametrize("module_type", ["Linear", "LayerNormLinear", "OpsLinear", "LayerNormMLP"])
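For orientation, the snippet below restates the pattern this test exercises as a standalone sketch: build a `CustomRecipe` from `nvfp4_ref_rht_2d_quantizer_factory` and run forward/backward under `autocast`. The import paths are assumptions where they are not visible in the hunk, and the optimizer step is an illustrative addition, not part of this commit.

```python
# Standalone sketch of the pattern tested above. Import paths mirror the test
# file where visible; the optimizer step is illustrative, not part of this commit.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
from transformer_engine.pytorch import Linear, autocast
from transformer_engine.pytorch.custom_recipes.quantization_nvfp4 import (
    nvfp4_ref_rht_2d_quantizer_factory,
)

# Requires a GPU with FP8/NVFP4 support; dims kept divisible by 16 as in the test.
model = Linear(64, 64, params_dtype=torch.bfloat16, bias=False).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
inp = torch.randn(32, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True)

# The quantizer factory supplies NVFP4 quantizers for the tensors the recipe covers.
custom_recipe = recipe.CustomRecipe(qfactory=nvfp4_ref_rht_2d_quantizer_factory)

with autocast(enabled=True, recipe=custom_recipe):
    out = model(inp)
out.float().sum().backward()
optimizer.step()
```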
@@ -68,7 +68,7 @@ if fp8_available:
    fp8_recipes.append(recipe.DelayedScaling())
fp8_recipes.append(None)

supported_activations = ["gelu", "relu", "reglu", "geglu", "swiglu"]
supported_activations = ["gelu", "relu", "reglu", "geglu", "swiglu", "clamped_swiglu"]
all_normalizations = ["LayerNorm", "RMSNorm"]
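The activation list gains "clamped_swiglu". As a rough reference only, a clamped SwiGLU can be sketched as SwiGLU with its inputs bounded to a fixed range before gating; the clamp placement and the limit value below are assumptions for illustration, not the kernel this commit enables.

```python
# Rough reference for a clamped SwiGLU; `limit` and the clamp placement are
# assumptions, not Transformer Engine's actual implementation.
import torch
import torch.nn.functional as F

def clamped_swiglu_ref(x: torch.Tensor, limit: float = 7.0) -> torch.Tensor:
    gate, value = x.chunk(2, dim=-1)            # split the last dim into gate/value halves
    gate = gate.clamp(min=-limit, max=limit)    # bound the gate pre-activation
    value = value.clamp(min=-limit, max=limit)  # bound the linear branch as well
    return F.silu(gate) * value                 # SwiGLU: SiLU(gate) * value
```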
@@ -205,6 +205,7 @@ class ModelConfig:
        window_size: Tuple[int, int] = (-1, -1),
        context_parallel: bool = False,
        cp_comm_type: str = "p2p",
        return_max_logit=False,
        total_requests: int = None,
        max_ctx_len: int = None,
        num_layers: int = 1,
@@ -233,6 +234,7 @@ class ModelConfig:
        self.window_size = check_set_window_size(self.attn_mask_type, window_size)
        self.context_parallel = context_parallel
        self.cp_comm_type = cp_comm_type
        self.return_max_logit = return_max_logit
        self.total_requests = total_requests
        self.max_ctx_len = max_ctx_len
        self.num_layers = num_layers
@@ -318,6 +320,7 @@ def get_available_attention_backends(
        is_training=is_training,
        inference_params=inference_params,
        softmax_type=config.softmax_type,
        return_max_logit=config.return_max_logit,
    )
    (
        use_flash_attention,
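The new `return_max_logit` plumbing threads a flag from `ModelConfig` into the backend-availability query. Conceptually (this is an assumption about the flag's semantics, not code from this commit), an attention path that honors it would additionally report the largest attention logit, for example to monitor logit growth:

```python
# Conceptual sketch only: assumes return_max_logit means "also report the
# maximum pre-softmax attention logit". Not Transformer Engine's implementation.
import torch

def sdpa_with_max_logit(q, k, v, return_max_logit=False):
    scale = q.shape[-1] ** -0.5
    logits = torch.matmul(q, k.transpose(-2, -1)) * scale   # scaled QK^T
    out = torch.matmul(torch.softmax(logits, dim=-1), v)
    if return_max_logit:
        return out, logits.amax()  # largest logit across batch/heads/positions
    return out
```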