fix: increase `/dev/shm` for CI pytest containers to fix NIXL UCX segfault (#7515)

4fc2b5e8 · KrishnanPrash · GitHub · 967ba9a2 · 4fc2b5e8 · 4fc2b5e8
Unverified commit 4fc2b5e8, authored Mar 19, 2026 by KrishnanPrash and committed by GitHub on Mar 19, 2026.
Showing 3 changed files with 18 additions and 15 deletions

.github/actions/pytest/action.yml +8 -3

tests/serve/test_sglang.py +3 -8

tests/serve/test_vllm.py +7 -4

No files found.
--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
@@ -154,6 +154,7 @@ runs:

        docker run ${GPU_FLAGS} --rm -w /workspace \
          --cpus=${NUM_CPUS} \
+          --shm-size=200m \
          --network host \
          "${DOCKER_ENV_FLAGS[@]}" \
          --name ${{ env.CONTAINER_ID }}_pytest \
@@ -189,9 +190,13 @@ runs:
        # Run pytest with detailed output and JUnit XML
        set +e  # Don't exit on test failures

-        # Define common docker flags for stability (Shared memory & limits)
-        # --ipc=host is critical for parallel pytest workers to communicate fast
-        DOCKER_OPTS="--ipc=host --ulimit memlock=-1 --ulimit stack=67108864"
+        # /dev/shm sizing: NIXL uses UCX, which allocates shared-memory segments in /dev/shm
+        # via shm_open(). Each NIXL agent with num_threads=8 creates 9 UCX workers, each needing
+        # ~4.8MB of shm (9 x 4.8MB = ~43MB per agent). Disaggregated tests use up to 3 agents
+        # (encode+prefill+decode) = ~130MB. Docker's default is 64MB, which is insufficient and
+        # causes segfaults. Do NOT use --ipc=host here — it overrides --shm-size with the host's
+        # /dev/shm (64MB in K8s pods by default), silently ignoring the size we set.
+        DOCKER_OPTS="--shm-size=200m --ulimit memlock=-1 --ulimit stack=67108864"

        # Determine docker runtime flags and pytest command based on dry_run mode
        if [[ "${{ inputs.dry_run }}" == "true" ]]; then

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -168,9 +168,9 @@ sglang_configs = {
        ],
    ),
    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
-    "multimodal_epd_qwen": SGLangConfig(
+    "multimodal_e_pd_qwen": SGLangConfig(
        # E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
-        name="multimodal_epd_qwen",
+        name="multimodal_e_pd_qwen",
        directory=sglang_dir,
        script_name="multimodal_epd.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
@@ -182,8 +182,6 @@ sglang_configs = {
            "DYN_WORKER_GPU": "0",
            "DYN_ENCODE_GPU_MEM": "0.1",
            "DYN_WORKER_GPU_MEM": "0.4",
-            # FIXME: NIXL Agent Initialization (shared memory interface) causes segfault
-            "UCX_TLS": "^mm",
        },
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
@@ -220,10 +218,7 @@ sglang_configs = {
        model="Qwen/Qwen3-VL-2B-Instruct",
        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
        timeout=360,
-        env={
-            # FIXME: NIXL Agent Initialization (shared memory interface) causes segfault
-            "UCX_TLS": "^mm",
-        },
+        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -299,8 +299,8 @@ vllm_configs = {
        ],
    ),
    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
-    "multimodal_disagg_qwen3vl_2b_e_pd": VLLMConfig(
-        name="multimodal_disagg_qwen3vl_2b_e_pd",
+    "multimodal_e_pd_qwen": VLLMConfig(
+        name="multimodal_e_pd_qwen",
        directory=vllm_dir,
        script_name="disagg_multimodal_e_pd.sh",
        marks=[
@@ -311,6 +311,9 @@ vllm_configs = {
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
+        env={
+            "DYN_VLLM_EMBEDDING_TRANSFER_MODE": "nixl-write",
+        },
        request_payloads=[
            chat_payload(
                [
@@ -376,8 +379,8 @@ vllm_configs = {
    # so _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect. Regardless of GPU_MEM
    # fractions (0.1/0.4/0.4), the 3 workers combined consistently use ~17.6 GiB
    # total on this GPU.
-    "multimodal_disagg_qwen3vl_2b_epd": VLLMConfig(
-        name="multimodal_disagg_qwen3vl_2b_epd",
+    "multimodal_disagg_qwen": VLLMConfig(
+        name="multimodal_disagg_qwen",
        directory=vllm_dir,
        script_name="disagg_multimodal_epd.sh",
        marks=[