Unverified commit 4fc2b5e8, authored by KrishnanPrash, committed by GitHub
Browse files

fix: increase `/dev/shm` for CI pytest containers to fix NIXL UCX segfault (#7515)

parent 967ba9a2
......@@ -154,6 +154,7 @@ runs:
docker run ${GPU_FLAGS} --rm -w /workspace \
--cpus=${NUM_CPUS} \
--shm-size=200m \
--network host \
"${DOCKER_ENV_FLAGS[@]}" \
--name ${{ env.CONTAINER_ID }}_pytest \
......@@ -189,9 +190,13 @@ runs:
# Run pytest with detailed output and JUnit XML
set +e # Don't exit on test failures
# Define common docker flags for stability (Shared memory & limits)
# --ipc=host is critical for parallel pytest workers to communicate fast
DOCKER_OPTS="--ipc=host --ulimit memlock=-1 --ulimit stack=67108864"
# /dev/shm sizing: NIXL uses UCX which allocates shared memory segments in /dev/shm
# via shm_open(). Each NIXL agent with num_threads=8 creates 9 UCX workers, each needing
# ~4.8MB of shm (9 x 4.8MB = ~43MB per agent). Disaggregated tests use up to 3 agents (encode+prefill+decode) = ~130MB.
# Docker's default is 64MB, which is insufficient and causes segfaults.
# Do NOT use --ipc=host here — it overrides --shm-size with the host's /dev/shm
# (64MB in K8s pods by default), silently ignoring the size we set.
DOCKER_OPTS="--shm-size=200m --ulimit memlock=-1 --ulimit stack=67108864"
# Determine docker runtime flags and pytest command based on dry_run mode
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
......
......@@ -168,9 +168,9 @@ sglang_configs = {
],
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_epd_qwen": SGLangConfig(
"multimodal_e_pd_qwen": SGLangConfig(
# E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
name="multimodal_epd_qwen",
name="multimodal_e_pd_qwen",
directory=sglang_dir,
script_name="multimodal_epd.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
......@@ -182,8 +182,6 @@ sglang_configs = {
"DYN_WORKER_GPU": "0",
"DYN_ENCODE_GPU_MEM": "0.1",
"DYN_WORKER_GPU_MEM": "0.4",
# FIXME: NIXL Agent Initialization (shared memory interface) causes segfault
"UCX_TLS": "^mm",
},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
......@@ -220,10 +218,7 @@ sglang_configs = {
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=360,
env={
# FIXME: NIXL Agent Initialization (shared memory interface) causes segfault
"UCX_TLS": "^mm",
},
env={},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
chat_payload(
......
......@@ -299,8 +299,8 @@ vllm_configs = {
],
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_disagg_qwen3vl_2b_e_pd": VLLMConfig(
name="multimodal_disagg_qwen3vl_2b_e_pd",
"multimodal_e_pd_qwen": VLLMConfig(
name="multimodal_e_pd_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_e_pd.sh",
marks=[
......@@ -311,6 +311,9 @@ vllm_configs = {
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
env={
"DYN_VLLM_EMBEDDING_TRANSFER_MODE": "nixl-write",
},
request_payloads=[
chat_payload(
[
......@@ -376,8 +379,8 @@ vllm_configs = {
# so _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect. Regardless of GPU_MEM
# fractions (0.1/0.4/0.4), the 3 workers combined consistently use ~17.6 GiB
# total on this GPU.
"multimodal_disagg_qwen3vl_2b_epd": VLLMConfig(
name="multimodal_disagg_qwen3vl_2b_epd",
"multimodal_disagg_qwen": VLLMConfig(
name="multimodal_disagg_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_epd.sh",
marks=[
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment