Merge branch 'develop_v2.4' into 'main'

[DCU] Avoid RTC trans kernel bug (needs fix). See merge request dcutoolkit/deeplearing/TransformerEngine!26

Merge branch 'develop_v2.4' into 'main'
[DCU] Avoid RTC trans kernel bug (needs fix). See merge request dcutoolkit/deeplearing/TransformerEngine!26
bfd4074f · yuguo · 84e198a3 · fdb21575 · bfd4074f · bfd4074f
Commit bfd4074f authored Jun 10, 2025 by yuguo
3 changed files
--- a/build_tools/pytorch.py
+++ b/build_tools/pytorch.py
@@ -79,11 +79,11 @@ def setup_pytorch_extension(
                ]
            )
-        if bool(int(os.getenv("NVTE_BUILD_SUPPRESS_RETURN_TYPE_WARNING", "0"))):
+        if bool(int(os.getenv("NVTE_BUILD_SUPPRESS_RETURN_TYPE_WARNING", "1"))):
            nvcc_flags.append("-Wno-return-type")
            cxx_flags.append("-Wno-return-type")
-        if bool(int(os.getenv("NVTE_BUILD_SUPPRESS_SIGN_COMPARE", "0"))):
+        if bool(int(os.getenv("NVTE_BUILD_SUPPRESS_SIGN_COMPARE", "1"))):
            nvcc_flags.append("-Wno-sign-compare")
            cxx_flags.append("-Wno-sign-compare")

--- a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py
+++ b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py
@@ -9,7 +9,7 @@ from pathlib import Path
 import pytest
 import torch
 from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
-# NVTE_INT8_SIM_FP8=1 torchrun --nproc_per_node=4 run_cast_master_weights_to_fp8.py --quantization fp8_block
+# NVTE_DISABLE_NVRTC=1 NVTE_INT8_SIM_FP8=1 torchrun --nproc_per_node=4 run_cast_master_weights_to_fp8.py --quantization fp8_block
 if torch.cuda.device_count() < 2:
    pytest.skip("cast_master_weights_to_fp8 test needs at least 2 GPUs.")

--- a/transformer_engine/pytorch/tensor/utils.py
+++ b/transformer_engine/pytorch/tensor/utils.py
@@ -437,7 +437,7 @@ def _cast_master_weights_to_fp8_blockwise_scaling(
        # We cannot create columnwise data here because users (like megatron) may want to overlap
        # the all-gather of model weights and forward process, so the model weight is not updated
        # at this moment.
-        model_weight.update_usage(rowwise_usage=True, columnwise_usage=False) # May cause core dump in iter 2
+        model_weight.update_usage(rowwise_usage=True, columnwise_usage=False)
        # If master weight is None, it means that the master weight of the current model weight
        # is in other DP ranks.