Make NCCL NVLS configurable (#3502)

b8318aec · Ata Fatahi · GitHub · 2f482210 · b8318aec · b8318aec
Unverified commit b8318aec, authored Feb 11, 2025 by Ata Fatahi; committed by GitHub on Feb 12, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 1 deletion

python/sglang/srt/entrypoints/engine.py (+1 -1)

python/sglang/srt/server_args.py (+6 -0)

No files found.
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -297,7 +297,7 @@ def _set_envs_and_config(server_args: ServerArgs):
    # Set global environments
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    os.environ["NCCL_CUMEM_ENABLE"] = "0"
-    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"


--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -140,6 +140,7 @@ class ServerArgs:
    disable_jump_forward: bool = False
    disable_cuda_graph: bool = False
    disable_cuda_graph_padding: bool = False
+    enable_nccl_nvls: bool = False
    disable_outlines_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
    disable_mla: bool = False
@@ -783,6 +784,11 @@ class ServerArgs:
            action="store_true",
            help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
        )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
        parser.add_argument(
            "--disable-outlines-disk-cache",
            action="store_true",