# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import torch

7
8
9
10
from vllm.logger import init_logger

logger = init_logger(__name__)

# Set common config/environment variables that must be in place
# for every process created by vLLM and every process that
# interacts with vLLM workers.
# This module is executed whenever `import vllm` is called.

# Decide whether to override NCCL_CUMEM_ENABLE for this process.
if 'NCCL_CUMEM_ENABLE' in os.environ:
    # An explicit value is already present: leave it untouched and only
    # warn about the possible memory cost.
    logger.warning(
        "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
        "This may increase memory overhead with cudagraph+allreduce: "
        "https://github.com/NVIDIA/nccl/issues/1234",
        os.environ['NCCL_CUMEM_ENABLE'])
elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
    # NCCL requires NCCL_CUMEM_ENABLE to work with
    # multi-node NVLink, typically on GB200-NVL72 systems.
    # The ultimate way to detect multi-node NVLink is to use
    # NVML APIs, which are too expensive to call here.
    # As an approximation, we check the existence of
    # /dev/nvidia-caps-imex-channels, used by
    # multi-node NVLink to communicate across nodes.
    # When that device exists we do not disable the feature: this will
    # still cost some GPU memory, but it is worthwhile because we can
    # get very fast cross-node bandwidth with NVLink.
    # The device is absent here, so disable it.
    os.environ['NCCL_CUMEM_ENABLE'] = '0'

# The assignments below are unconditional: any value already present in
# the environment for these variables is overwritten.

# see https://github.com/vllm-project/vllm/pull/15951
# it avoids unintentional cuda initialization from torch.cuda.is_available()
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'

# see https://github.com/vllm-project/vllm/issues/10480
os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
# see https://github.com/vllm-project/vllm/issues/10619
# NOTE(review): the in-process override below is disabled, so only the
# environment variable above takes effect -- confirm this is intentional.
# torch._inductor.config.compile_threads = 1