[JAX] Improving the test_multiprocessing_encoder.py run script (#1673)

* script improvement * add wait * add return code back * relax tols for FP8 test in test_multiprocessing_ by 0.001 --------- Signed-off-by: Phuong Nguyen <phuonguyen@nvidia.com>

[JAX] Improving the test_multiprocessing_encoder.py run script (#1673)
* script improvement * add wait * add return code back * relax tols for FP8 test in test_multiprocessing_ by 0.001 --------- Signed-off-by: Phuong Nguyen <phuonguyen@nvidia.com>
313ab4f4 · Phuong Nguyen · GitHub · 5fdd7bb9 · 313ab4f4 · 313ab4f4
Unverified Commit 313ab4f4 authored Apr 14, 2025 by Phuong Nguyen Committed by GitHub Apr 14, 2025
3 changed files
--- a/examples/jax/encoder/run_test_multiprocessing_encoder.sh
+++ b/examples/jax/encoder/run_test_multiprocessing_encoder.sh
@@ -4,32 +4,54 @@

 NUM_GPUS=${NUM_GPUS:-$(nvidia-smi -L | wc -l)}

-for i in $(seq 0 $(($NUM_GPUS-1)))
-do
-  pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::test_te_bf16 --num-process=$NUM_GPUS --process-id=$i &
-done
-wait
+# Define the test cases to run
+TEST_CASES=(
+"test_te_bf16"
+"test_te_delayed_scaling_fp8"
+"test_te_mxfp8"
+"test_te_bf16_shardy"
+"test_te_delayed_scaling_fp8_shardy"
+)

-for i in $(seq 0 $(($NUM_GPUS-1)))
-do
-  pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::test_te_delayed_scaling_fp8 --num-process=$NUM_GPUS --process-id=$i &
-done
-wait
+echo
+echo "*** Executing tests in examples/jax/encoder/test_multiprocessing_encoder.py ***"

-for i in $(seq 0 $(($NUM_GPUS-1)))
-do
-  pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::test_te_mxfp8 --num-process=$NUM_GPUS --process-id=$i &
-done
-wait
+HAS_FAILURE=0  # Global failure flag

-for i in $(seq 0 $(($NUM_GPUS-1)))
-do
-  pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::test_te_bf16_shardy --num-process=$NUM_GPUS --process-id=$i &
-done
-wait
+# Run each test case across all GPUs
+for TEST_CASE in "${TEST_CASES[@]}"; do
+  echo
+  echo "=== Starting test: $TEST_CASE ..."
+
+  for i in $(seq 0 $(($NUM_GPUS - 1))); do
+    # Define output file for logs
+    LOG_FILE="${TEST_CASE}_gpu_${i}.log"

-for i in $(seq 0 $(($NUM_GPUS-1)))
-do
-  pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::test_te_delayed_scaling_fp8_shardy --num-process=$NUM_GPUS --process-id=$i &
+    # Run pytest and redirect stdout and stderr to the log file
+    pytest -c "$TE_PATH/tests/jax/pytest.ini" \
+      -vs "$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py::TestEncoder::$TEST_CASE" \
+      --num-process=$NUM_GPUS \
+      --process-id=$i > "$LOG_FILE" 2>&1 &
+    done
+
+  # Wait for the process to finish
+  wait
+
+  # Check and print the log content accordingly
+  if grep -q "FAILED" "${TEST_CASE}_gpu_0.log"; then
+    HAS_FAILURE=1
+    echo "... $TEST_CASE FAILED"
+    tail -n +7 "${TEST_CASE}_gpu_0.log"
+  elif grep -q "SKIPPED" "${TEST_CASE}_gpu_0.log"; then
+    echo "... $TEST_CASE SKIPPED"
+  elif grep -q "PASSED" "${TEST_CASE}_gpu_0.log"; then
+    echo "... $TEST_CASE PASSED"
+  else
+    echo "Invalid ${TEST_CASE}_gpu_0.log"
+  fi
+
+  # Remove the log file after processing it
+  rm ${TEST_CASE}_gpu_*.log
 done
-wait
+
+exit $HAS_FAILURE
--- a/examples/jax/encoder/test_multiprocessing_encoder.py
+++ b/examples/jax/encoder/test_multiprocessing_encoder.py
@@ -609,7 +609,7 @@ class TestEncoder(unittest.TestCase):
    def test_te_delayed_scaling_fp8(self):
        """Test Transformer Engine with DelayedScaling FP8"""
        result = self.exec(True, "DelayedScaling")
-        assert result[0] < 0.505 and result[1] > 0.755
+        assert result[0] < 0.505 and result[1] > 0.754

    @unittest.skipIf(
        not is_mxfp8_supported(), "Device compute capability 10.0+ is required for MXFP8"

--- a/qa/L0_jax_distributed_unittest/test.sh
+++ b/qa/L0_jax_distributed_unittest/test.sh
@@ -23,7 +23,9 @@ pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Fa
 # Make encoder tests to have run-to-run deterministic to have the stable CI results
 export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
 python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multigpu_encoder.py || test_fail "test_multigpu_encoder.py"
+wait
 python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_model_parallel_encoder.py || test_fail "test_model_parallel_encoder.py"
+wait
 . $TE_PATH/examples/jax/encoder/run_test_multiprocessing_encoder.sh || test_fail "test_multiprocessing_encoder.py"

 if [ $RET -ne 0 ]; then