Unverified commit 2886cbce, authored by Paweł Gadziński, committed by GitHub
Browse files

[PyTorch debug] Fix test for debug tools (#2507)



* Skip delayed wgrad tests in distributed numerics when debug mode is enabled
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent b215116a
...@@ -44,7 +44,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_ ...@@ -44,7 +44,7 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_
pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py" pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_distributed.xml $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
# standard numerics tests with initialized debug # standard numerics tests with initialized debug
NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py" NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_2.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
if [ "$RET" -ne 0 ]; then if [ "$RET" -ne 0 ]; then
echo "Error in the following test cases:$FAILED_CASES" echo "Error in the following test cases:$FAILED_CASES"
......
...@@ -38,8 +38,9 @@ WORLD_RANK, WORLD_SIZE = None, None ...@@ -38,8 +38,9 @@ WORLD_RANK, WORLD_SIZE = None, None
NCCL_WORLD = None NCCL_WORLD = None
LOSS_FN = nn.MSELoss() LOSS_FN = nn.MSELoss()
QUANTIZATION = None QUANTIZATION = None
NVTE_TEST_NVINSPECT_ENABLED = int(os.environ.get("NVTE_TEST_NVINSPECT_ENABLED") or "0")
if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False): if NVTE_TEST_NVINSPECT_ENABLED:
# The numerics of all the layers should work the same, # The numerics of all the layers should work the same,
# when debug=True. I fed them with dummy feature # when debug=True. I fed them with dummy feature
# to prevent switching off debug, which can happen if # to prevent switching off debug, which can happen if
...@@ -745,6 +746,8 @@ def test_linear(): ...@@ -745,6 +746,8 @@ def test_linear():
for kwargs in kwargs_list: for kwargs in kwargs_list:
if kwargs.get("save_original_input", False) and QUANTIZATION == "fp8": if kwargs.get("save_original_input", False) and QUANTIZATION == "fp8":
continue continue
if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
continue
for parallel_mode in ["column", "row"]: for parallel_mode in ["column", "row"]:
for sequence_parallel in [False, True]: for sequence_parallel in [False, True]:
_test_linear(parallel_mode, sequence_parallel, **kwargs) _test_linear(parallel_mode, sequence_parallel, **kwargs)
...@@ -924,6 +927,8 @@ def test_layernorm_linear(): ...@@ -924,6 +927,8 @@ def test_layernorm_linear():
] ]
for kwargs in kwargs_list: for kwargs in kwargs_list:
if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
continue
for parallel_mode in ["column"]: for parallel_mode in ["column"]:
for sequence_parallel in [False, True]: for sequence_parallel in [False, True]:
_test_layernorm_linear(parallel_mode, sequence_parallel, **kwargs) _test_layernorm_linear(parallel_mode, sequence_parallel, **kwargs)
...@@ -1034,6 +1039,8 @@ def test_layernorm_mlp(): ...@@ -1034,6 +1039,8 @@ def test_layernorm_mlp():
] ]
for kwargs in kwargs_list: for kwargs in kwargs_list:
if kwargs.get("delay_wgrad_compute", False) and NVTE_TEST_NVINSPECT_ENABLED:
continue
for set_parallel_mode in [True]: for set_parallel_mode in [True]:
for sequence_parallel in [False, True]: for sequence_parallel in [False, True]:
_test_layernorm_mlp(set_parallel_mode, sequence_parallel, **kwargs) _test_layernorm_mlp(set_parallel_mode, sequence_parallel, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment