Commit 15a0011b authored by wenjh

Merge branch 'TE_ta_release2.7' into 'release_v2.7'

[DCU] Fix MPI root support, enable int8 simulation and batched_linear to access...

See merge request dcutoolkit/deeplearing/TransformerEngine!55
parents 4d0d3245 bd5a6e86
@@ -35,7 +35,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_
 python3 -m pytest -v -s --log-cli-level=INFO --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
 # debug tests
@@ -46,7 +46,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
 : ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
 : ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
-pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
+NVTE_INT8_SIM_FP8=1 pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
 # standard numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
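The NVTE_INT8_SIM_FP8=1 prefix on the two pytest invocations above reruns the same distributed tests with int8 simulation of FP8 enabled. A minimal sketch of how such a toggle is typically read, assuming the variable is consumed as a simple on/off flag (the helper name int8_sim_enabled is hypothetical, not TransformerEngine API):

    import os

    def int8_sim_enabled() -> bool:
        # Hypothetical helper: only the variable name NVTE_INT8_SIM_FP8 comes from
        # the diff above; treating "1" as the enabling value is an assumption.
        return os.environ.get("NVTE_INT8_SIM_FP8", "0") == "1"

    if int8_sim_enabled():
        print("running FP8 paths in int8 simulation mode")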
@@ -432,7 +432,7 @@ def test_fuser_ops_with_userbuffers(
     command = []
     if tex.ubuf_built_with_mpi():
         python_exe = pathlib.Path(sys.executable).resolve()
-        command.extend(("mpirun", "-np", str(world_size), "--oversubscribe", "--quiet", python_exe))
+        command.extend(("mpirun", "-np", str(world_size), "--allow-run-as-root", "--oversubscribe", "--quiet", python_exe))
     else:
         command.extend(("torchrun", f"--nproc_per_node={world_size}"))
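Open MPI refuses to launch processes as the root user unless --allow-run-as-root is passed, and CI containers commonly run as root, which is why the mpirun command above needed the extra flag. A hedged sketch of building such a launcher command; the root check is illustrative only, since the diff above adds the flag unconditionally:

    import os
    import pathlib
    import sys

    world_size = 2  # illustrative value
    command = ["mpirun", "-np", str(world_size), "--oversubscribe", "--quiet"]
    if hasattr(os, "geteuid") and os.geteuid() == 0:
        # Open MPI aborts at startup under root without this flag.
        command.insert(1, "--allow-run-as-root")
    command.append(str(pathlib.Path(sys.executable).resolve()))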
@@ -162,7 +162,8 @@ class _BatchLinear(torch.autograd.Function):
         for i in range(num_gemms):
             weights[i].offloading_activation = False
-            weights[i].main_grad.offloading_activation = False
+            if fuse_wgrad_accumulation and hasattr(weights[i], 'main_grad'):
+                weights[i].main_grad.offloading_activation = False
             if weights_fp8[i] is not None:
                 weights_fp8[i].offloading_activation = False
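The removed line dereferenced weights[i].main_grad unconditionally, but main_grad is only attached to a parameter when gradient-accumulation fusion is active, so the unguarded access raises AttributeError otherwise. A minimal reproduction of the failure mode the new guard prevents, using plain torch rather than TransformerEngine internals:

    import torch

    w = torch.nn.Parameter(torch.zeros(4, 4))
    fuse_wgrad_accumulation = False  # main_grad is only attached when this is True

    # Unguarded: w.main_grad.offloading_activation = False  -> AttributeError
    if fuse_wgrad_accumulation and hasattr(w, "main_grad"):
        w.main_grad.offloading_activation = False  # runs only when main_grad exists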
@@ -553,7 +554,7 @@ class BatchedLinear(TransformerEngineBaseModule):
         if self.primary_weights_in_fp8:
             self.init_fp8_metadata(num_gemms=self.num_gemms)
         self.reset_parameters(defer_init=(device == "meta"))
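For context, defer_init=(device == "meta") skips weight materialization when the module is constructed on PyTorch's meta device, where tensors carry shape and dtype but no storage. A generic sketch of that pattern, assuming nothing about BatchedLinear's reset_parameters beyond what the context lines show:

    import torch

    device = "meta"  # shape-only tensors, no memory allocated
    weight = torch.empty(8, 8, device=device)
    defer_init = device == "meta"
    if not defer_init:
        torch.nn.init.xavier_uniform_(weight)  # real init runs after materialization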