collect_env.py 27.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
# ruff: noqa
5
6
7
8
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

import datetime
import locale
9
import os
10
11
import subprocess
import sys
12

13
14
15
# NOTE: the upstream PyTorch script had to stay Python 2 compliant; this copy
# already relies on Python 3 features (e.g. f-strings), so it requires Python 3.
# This script outputs relevant system environment info.
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
16
17
from collections import namedtuple

18
19
import regex as re

20
21
from vllm.envs import environment_variables

22
23
# torch is optional for this script: when importing it fails for any of the
# reasons below, the torch-specific probes report "N/A" instead of crashing.
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False

# System Environment Information
30
# Immutable record holding one value per probe; `pretty_str` formats it via
# `_asdict()`, so field names must match the placeholders in `env_info_fmt`.
SystemEnv = namedtuple(
    "SystemEnv",
    [
        "torch_version",
        "is_debug_build",
        "cuda_compiled_version",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "os",
        "libc_version",
        "python_version",
        "python_platform",
        "is_cuda_available",
        "cuda_runtime_version",
        "cuda_module_loading",
        "nvidia_driver_version",
        "nvidia_gpu_models",
        "cudnn_version",
        "pip_version",  # 'pip' or 'pip3'
        "pip_packages",
        "conda_packages",
        "hip_compiled_version",
        "hip_runtime_version",
        "miopen_runtime_version",
        "caching_allocator_config",
        "is_xnnpack_available",
        "cpu_info",
        "rocm_version",  # vllm specific field
        "vllm_version",  # vllm specific field
        "vllm_build_flags",  # vllm specific field
        "gpu_topo",  # vllm specific field
        "env_vars",
    ],
)
65
66
67
68
69
70
71
72
73
74

# Substrings used to pick the relevant lines out of `conda list` output
# (a line is kept when it contains any of these).
DEFAULT_CONDA_PATTERNS = {
    "torch",
    "numpy",
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
    "triton",
    "optree",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
    "helion",
}

# Substrings used to pick the relevant lines out of the pip package listing
# (a line is kept when it contains any of these).
DEFAULT_PIP_PATTERNS = {
    "torch",
    "numpy",
    "mypy",
    "flake8",
    "triton",
    "optree",
    "onnx",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
    "helion",
}


def run(command):
    """Run ``command`` and return ``(return-code, stdout, stderr)``.

    A string command is executed through the shell; any other value (e.g. an
    argv list) is executed directly. Both streams are decoded and stripped.
    If the executable cannot be found, ``(127, "", "Command not found: ...")``
    is returned instead of raising.
    """
    use_shell = isinstance(command, str)
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=use_shell
        )
    except FileNotFoundError:
        cmd_str = command if isinstance(command, str) else command[0]
        return 127, "", f"Command not found: {cmd_str}"

    raw_output, raw_err = proc.communicate()
    rc = proc.returncode

    enc = "oem" if get_platform() == "win32" else locale.getpreferredencoding()
    # errors="replace": a tool printing bytes that are invalid in `enc` must
    # not abort the whole environment report.
    output = raw_output.decode(enc, errors="replace")
    if command == "nvidia-smi topo -m":
        # don't remove the leading whitespace of `nvidia-smi topo -m`
        #   because they are meaningful
        output = output.rstrip()
    else:
        output = output.strip()
    err = raw_err.decode(enc, errors="replace")
    return rc, output, err.strip()
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147


def run_and_read_all(run_lambda, command):
    """Execute ``command`` via ``run_lambda``; return its full output, or None on a non-zero exit."""
    returncode, stdout, _stderr = run_lambda(command)
    return stdout if returncode == 0 else None


def run_and_parse_first_match(run_lambda, command, regex):
    """Execute ``command`` via ``run_lambda`` and return group 1 of the first ``regex`` match.

    Returns None when the command exits non-zero or nothing matches.
    """
    returncode, stdout, _stderr = run_lambda(command)
    if returncode == 0:
        found = re.search(regex, stdout)
        if found is not None:
            return found.group(1)
    return None

148

149
150
151
152
153
def run_and_return_first_line(run_lambda, command):
    """Execute ``command`` via ``run_lambda`` and return the first line of its output.

    Returns None on a non-zero exit; empty output yields an empty string.
    """
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    first_line, _sep, _rest = out.partition("\n")
    return first_line
155
156
157
158
159


def get_conda_packages(run_lambda, patterns=None):
    """Return the ``conda list`` lines that mention any of ``patterns``.

    The conda executable is taken from ``$CONDA_EXE`` (default ``conda``).
    Comment lines (starting with ``#``) are dropped. Returns None when the
    listing could not be obtained; ``patterns`` defaults to
    ``DEFAULT_CONDA_PATTERNS``.
    """
    if patterns is None:
        patterns = DEFAULT_CONDA_PATTERNS

    conda = os.environ.get("CONDA_EXE", "conda")
    listing = run_and_read_all(run_lambda, [conda, "list"])
    if listing is None:
        return None

    relevant = [
        line
        for line in listing.splitlines()
        if not line.startswith("#") and any(name in line for name in patterns)
    ]
    return "\n".join(relevant)
170

171
172

def get_gcc_version(run_lambda):
    """Return the text following ``gcc `` in ``gcc --version`` output, or None."""
    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
174

175

176
def get_clang_version(run_lambda):
    """Return the text following ``clang version `` in ``clang --version`` output, or None."""
    return run_and_parse_first_match(
        run_lambda, "clang --version", r"clang version (.*)"
    )
180
181
182


def get_cmake_version(run_lambda):
    """Return the text following ``cmake `` in ``cmake --version`` output, or None."""
    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
184
185
186


def get_nvidia_driver_version(run_lambda):
    """Return the NVIDIA driver version string, or None if it cannot be determined.

    On macOS the version is parsed from ``kextstat``; elsewhere from the
    ``Driver Version:`` field printed by nvidia-smi.
    """
    if get_platform() == "darwin":
        return run_and_parse_first_match(
            run_lambda,
            "kextstat | grep -i cuda",
            r"com[.]nvidia[.]CUDA [(](.*?)[)]",
        )
    return run_and_parse_first_match(
        run_lambda, get_nvidia_smi(), r"Driver Version: (.*?) "
    )
194
195
196


def get_gpu_info(run_lambda):
    """Return a description of the installed GPU(s), or None if unavailable.

    On macOS and on ROCm builds of torch the name comes from torch itself
    (with the GCN arch appended on ROCm); otherwise it is the output of
    ``nvidia-smi -L`` with the per-device UUIDs removed.
    """
    # getattr() instead of a bare attribute read: torch builds without a
    # `version.hip` attribute must not raise on the macOS path.
    is_rocm_build = TORCH_AVAILABLE and getattr(torch.version, "hip", None) is not None

    if get_platform() == "darwin" or is_rocm_build:
        if not (TORCH_AVAILABLE and torch.cuda.is_available()):
            return None
        if is_rocm_build:
            prop = torch.cuda.get_device_properties(0)
            if hasattr(prop, "gcnArchName"):
                gcn_arch = " ({})".format(prop.gcnArchName)
            else:
                gcn_arch = "NoGCNArchNameOnOldPyTorch"
        else:
            gcn_arch = ""
        return torch.cuda.get_device_name(None) + gcn_arch

    smi = get_nvidia_smi()
    rc, out, _ = run_lambda(smi + " -L")
    if rc != 0:
        return None
    # Anonymize GPUs by removing their UUID
    return re.sub(r" \(UUID: .+?\)", "", out)
220
221
222


def get_running_cuda_version(run_lambda):
    """Return the version after ``V`` on the ``release`` line of ``nvcc --version``, or None."""
    return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
224
225
226
227


def get_cudnn_version(run_lambda):
    """Return the path(s) of the cuDNN libraries found on this machine.

    It is hard to tell which copy is actually in use, so every candidate is
    reported: a single path, a multi-line "Probably one of the following"
    listing, or None when nothing is found. ``$CUDNN_LIBRARY`` is used as a
    fallback when the platform lookup yields nothing.
    """
    platform_name = get_platform()
    if platform_name == "win32":
        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
        where_cmd = os.path.join(system_root, "System32", "where")
        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
    elif platform_name == "darwin":
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'

    rc, out, _ = run_lambda(cudnn_cmd)
    # Exit code 1 is tolerated as long as something was printed (the lookup
    # may return 1 on permission errors or when nothing is found).
    if len(out) == 0 or rc not in (0, 1):
        cudnn_library = os.environ.get("CUDNN_LIBRARY")
        if cudnn_library is not None and os.path.isfile(cudnn_library):
            return os.path.realpath(cudnn_library)
        return None

    # realpath() eliminates symbolic links; the set removes duplicates.
    resolved = {os.path.realpath(fn) for fn in out.split("\n")}
    # Alphabetize the result because the order is non-deterministic otherwise
    files = sorted(path for path in resolved if os.path.isfile(path))
    if not files:
        return None
    if len(files) == 1:
        return files[0]
    return "Probably one of the following:\n{}".format("\n".join(files))
261
262
263
264


def get_nvidia_smi():
    """Return the command used to invoke nvidia-smi.

    On Windows the first existing candidate (System32, then the legacy
    NVSMI folder) is returned as a double-quoted path; otherwise the bare
    name ``nvidia-smi`` is returned.
    """
    # Note: nvidia-smi is currently available only on Windows and Linux
    smi = "nvidia-smi"
    if get_platform() != "win32":
        return smi

    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
    program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
    legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi)
    new_path = os.path.join(system_root, "System32", smi)
    for candidate_smi in (new_path, legacy_path):
        if os.path.exists(candidate_smi):
            return '"{}"'.format(candidate_smi)
    return smi


def get_rocm_version(run_lambda):
    """Return the ``HIP version:`` value printed by ``hipcc --version``, or None if unavailable."""
    return run_and_parse_first_match(
        run_lambda, "hipcc --version", r"HIP version: (\S+)"
    )
286
287
288


def get_vllm_version():
    """Return the vLLM version, annotated with git sha (and date) for dev builds.

    The last element of ``__version_tuple__`` is inspected: a string starting
    with ``g`` is treated as a git-sha tag, optionally followed by
    ``.<date tag>`` when the build contains local changes.
    """
    from vllm import __version__, __version_tuple__

    if __version__ == "dev":
        return "N/A (dev)"

    version_str = __version_tuple__[-1]
    if not (isinstance(version_str, str) and version_str.startswith("g")):
        return __version__

    # it's a dev build
    if "." in version_str:
        # it's a dev build containing local changes
        parts = version_str.split(".")
        git_sha = parts[0][1:]  # drop the leading "g"
        date = parts[-1][1:]  # drop the one-character prefix of the date tag
        return f"{__version__} (git sha: {git_sha}, date: {date})"

    # it's a dev build without local changes
    git_sha = version_str[1:]  # type: ignore
    return f"{__version__} (git sha: {git_sha})"
306

307

308
309
def summarize_vllm_build_flags():
    """Summarize build-related environment settings in one line.

    Reports ``$TORCH_CUDA_ARCH_LIST`` (or "Not Set") and whether
    ``$ROCM_HOME`` is set to a non-empty value.
    """
    cuda_archs = os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set")
    rocm_state = "Enabled" if os.environ.get("ROCM_HOME") else "Disabled"
    return "CUDA Archs: {}; ROCm: {}".format(cuda_archs, rocm_state)


def get_gpu_topo(run_lambda):
    """Return the GPU topology report on Linux, or None.

    ``nvidia-smi topo -m`` is tried first; if it fails, ``rocm-smi --showtopo``
    is used. Other platforms always yield None.
    """
    if get_platform() != "linux":
        return None

    output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
    if output is None:
        output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
    return output
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401


# example outputs of CPU infos
#  * linux
#    Architecture:            x86_64
#      CPU op-mode(s):        32-bit, 64-bit
#      Address sizes:         46 bits physical, 48 bits virtual
#      Byte Order:            Little Endian
#    CPU(s):                  128
#      On-line CPU(s) list:   0-127
#    Vendor ID:               GenuineIntel
#      Model name:            Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#        CPU family:          6
#        Model:               106
#        Thread(s) per core:  2
#        Core(s) per socket:  32
#        Socket(s):           2
#        Stepping:            6
#        BogoMIPS:            5799.78
#        Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
#                             sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
#                             xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
#                             pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
#                             hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
#                             fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
#                             avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
#                             xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
#                             avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
#    Virtualization features:
#      Hypervisor vendor:     KVM
#      Virtualization type:   full
#    Caches (sum of all):
#      L1d:                   3 MiB (64 instances)
#      L1i:                   2 MiB (64 instances)
#      L2:                    80 MiB (64 instances)
#      L3:                    108 MiB (2 instances)
#    NUMA:
#      NUMA node(s):          2
#      NUMA node0 CPU(s):     0-31,64-95
#      NUMA node1 CPU(s):     32-63,96-127
#    Vulnerabilities:
#      Itlb multihit:         Not affected
#      L1tf:                  Not affected
#      Mds:                   Not affected
#      Meltdown:              Not affected
#      Mmio stale data:       Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
#      Retbleed:              Not affected
#      Spec store bypass:     Mitigation; Speculative Store Bypass disabled via prctl and seccomp
#      Spectre v1:            Mitigation; usercopy/swapgs barriers and __user pointer sanitization
#      Spectre v2:            Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
#      Srbds:                 Not affected
#      Tsx async abort:       Not affected
#  * win32
#    Architecture=9
#    CurrentClockSpeed=2900
#    DeviceID=CPU0
#    Family=179
#    L2CacheSize=40960
#    L2CacheSpeed=
#    Manufacturer=GenuineIntel
#    MaxClockSpeed=2900
#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#    ProcessorType=3
#    Revision=27142
#
#    Architecture=9
#    CurrentClockSpeed=2900
#    DeviceID=CPU1
#    Family=179
#    L2CacheSize=40960
#    L2CacheSpeed=
#    Manufacturer=GenuineIntel
#    MaxClockSpeed=2900
#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#    ProcessorType=3
#    Revision=27142

402

403
def get_cpu_info(run_lambda):
    """Return the platform's CPU description text.

    Uses ``lscpu`` on Linux, ``wmic cpu get ...`` on Windows and ``sysctl`` on
    macOS. When the command fails its stderr is returned instead; on any
    other platform an empty string is returned.
    """
    platform_name = get_platform()
    rc, out, err = 0, "", ""
    if platform_name == "linux":
        rc, out, err = run_lambda("lscpu")
    elif platform_name == "win32":
        # NOTE: the backslash continuation keeps the next line's leading
        # spaces inside the command string; left exactly as-is.
        rc, out, err = run_lambda(
            "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
        )
    elif platform_name == "darwin":
        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")

    return out if rc == 0 else err


def get_platform():
    """Return a normalized platform name.

    ``sys.platform`` is collapsed to one of ``linux``, ``win32``, ``cygwin``
    or ``darwin`` when it starts with that prefix; otherwise it is returned
    unchanged.
    """
    for known in ("linux", "win32", "cygwin", "darwin"):
        if sys.platform.startswith(known):
            return known
    return sys.platform


def get_mac_version(run_lambda):
    """Return the first line printed by ``sw_vers -productVersion``, or None on failure."""
    return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
437
438
439


def get_windows_version(run_lambda):
    """Return the Windows caption reported by ``wmic os get Caption``, or None on failure.

    The ``Caption`` header line is filtered out with ``findstr /v``; both tools
    are addressed via ``$SYSTEMROOT`` (default ``C:\\Windows``).
    """
    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
    wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
    findstr_cmd = os.path.join(system_root, "System32", "findstr")
    command = "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd)
    return run_and_read_all(run_lambda, command)
446
447
448


def get_lsb_version(run_lambda):
    """Return the ``Description:`` field of ``lsb_release -a``, or None."""
    return run_and_parse_first_match(
        run_lambda, "lsb_release -a", r"Description:\t(.*)"
    )
452
453
454


def check_release_file(run_lambda):
    """Return the ``PRETTY_NAME`` value found in ``/etc/*-release``, or None."""
    return run_and_parse_first_match(
        run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"'
    )
458
459
460
461


def get_os(run_lambda):
    """Return a human-readable OS description for the current platform.

    Windows/cygwin: the Windows caption. macOS: ``macOS <version> (<arch>)``,
    or None if the version cannot be read. Linux: the lsb_release
    description, else the /etc/*-release pretty name, else the platform name,
    each suffixed with the machine architecture. Any other platform: its
    platform name.
    """
    from platform import machine

    platform = get_platform()

    if platform in ("win32", "cygwin"):
        return get_windows_version(run_lambda)

    if platform == "darwin":
        version = get_mac_version(run_lambda)
        if version is None:
            return None
        return "macOS {} ({})".format(version, machine())

    if platform == "linux":
        # Ubuntu/Debian based first, then fall back to reading /etc/*-release
        for probe in (get_lsb_version, check_release_file):
            desc = probe(run_lambda)
            if desc is not None:
                return "{} ({})".format(desc, machine())
        return "{} ({})".format(platform, machine())

    # Unknown platform
    return platform


def get_python_platform():
    """Return the string produced by ``platform.platform()``."""
    import platform

    return platform.platform()


def get_libc_version():
    """Return ``<lib>-<version>`` from ``platform.libc_ver()`` on Linux, else "N/A"."""
    import platform

    if get_platform() != "linux":
        return "N/A"
    return "-".join(platform.libc_ver())
503
504


505
506
507
def is_uv_venv():
    """Return True when the interpreter appears to run inside a uv environment.

    That is the case when ``$UV`` is set to a non-empty value, or when
    ``<sys.prefix>/pyvenv.cfg`` exists and contains a line starting with
    ``uv = ``.
    """
    if os.environ.get("UV"):
        return True

    pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
    if not os.path.exists(pyvenv_cfg_path):
        return False
    with open(pyvenv_cfg_path, "r") as cfg:
        return any(line.startswith("uv = ") for line in cfg)


515
516
517
518
519
def get_pip_packages(run_lambda, patterns=None):
    """Return ``(pip_version, packages)`` for the relevant installed packages.

    ``packages`` holds the lines of the freeze-format package listing that
    contain any of ``patterns`` (default ``DEFAULT_PIP_PATTERNS``), or None
    when the listing command failed. Note: will also find conda-installed
    pytorch and numpy packages.

    Raises:
        RuntimeError: if neither the pip module nor a uv environment is
            available.
    """
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

    def run_with_pip():
        try:
            import importlib.util

            pip_available = importlib.util.find_spec("pip") is not None
        except ImportError:
            pip_available = False

        if pip_available:
            cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
        elif is_uv_venv():
            print("uv is set")
            cmd = ["uv", "pip", "list", "--format=freeze"]
        else:
            raise RuntimeError(
                "Could not collect pip list output (pip or uv module not available)"
            )

        out = run_and_read_all(run_lambda, cmd)
        if out is None:
            # The listing command failed; report "could not collect" (None)
            # instead of crashing on None.splitlines().
            return None
        return "\n".join(
            line for line in out.splitlines() if any(name in line for name in patterns)
        )

    pip_version = "pip3" if sys.version[0] == "3" else "pip"
    return pip_version, run_with_pip()


def get_cachingallocator_config():
    """Return the value of ``$PYTORCH_CUDA_ALLOC_CONF``, or "" when it is unset."""
    return os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")


def get_cuda_module_loading_config():
    """Return ``$CUDA_MODULE_LOADING`` once CUDA is initialized, or "N/A" without CUDA.

    The environment variable is read only after ``torch.cuda.init()`` has
    been called; an unset variable yields "".
    """
    if not (TORCH_AVAILABLE and torch.cuda.is_available()):
        return "N/A"
    torch.cuda.init()
    return os.environ.get("CUDA_MODULE_LOADING", "")


def is_xnnpack_available():
    """Return ``str(torch.backends.xnnpack.enabled)``, or "N/A" when torch is unavailable."""
    if not TORCH_AVAILABLE:
        return "N/A"

    import torch.backends.xnnpack

    return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]

571

572
def get_env_vars():
    """Return the relevant environment variables as ``KEY=VALUE`` lines.

    A variable is reported when its name is one of vLLM's
    ``environment_variables`` or starts with one of the reported prefixes.
    Names containing a secret-looking term (case-insensitive) are always
    skipped. Each reported variable appears exactly once, and every line is
    newline-terminated.
    """
    secret_terms = ("secret", "token", "api", "access", "password")
    report_prefix = (
        "TORCH",
        "NCCL",
        "PYTORCH",
        "CUDA",
        "CUBLAS",
        "CUDNN",
        "OMP_",
        "MKL_",
        "NVIDIA",
    )

    lines = []
    for k, v in os.environ.items():
        if any(term in k.lower() for term in secret_terms):
            continue
        # A single combined test: a name matching both criteria (e.g. a vLLM
        # variable that also starts with "CUDA") must not be listed twice.
        if k in environment_variables or k.startswith(report_prefix):
            lines.append("{}={}\n".format(k, v))

    return "".join(lines)
595

596

597
598
599
600
601
602
603
604
605
def get_env_info():
    """Run every probe and return the results as a ``SystemEnv`` tuple.

    Torch-derived fields are "N/A" when torch could not be imported; the
    HIP/MIOpen fields are "N/A" unless torch is a ROCm build.
    """
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    # Defaults for "torch missing" and "torch is not a ROCm build".
    hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"

    if TORCH_AVAILABLE:
        version_str = torch.__version__
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda

        if getattr(torch.version, "hip", None) is not None:  # HIP version

            def get_version_or_na(cfg, prefix):
                # Last whitespace-separated token of the first line mentioning `prefix`.
                matches = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
                return matches[0] if matches else "N/A"

            cfg = torch._C._show_config().split("\n")
            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
            cuda_version_str = "N/A"
            hip_compiled_version = torch.version.hip
    else:
        version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"

    sys_version = sys.version.replace("\n", " ")

    conda_packages = get_conda_packages(run_lambda)

    rocm_version = get_rocm_version(run_lambda)
    vllm_version = get_vllm_version()
    vllm_build_flags = summarize_vllm_build_flags()
    gpu_topo = get_gpu_topo(run_lambda)

    return SystemEnv(
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        python_version="{} ({}-bit runtime)".format(
            sys_version, sys.maxsize.bit_length() + 1
        ),
        python_platform=get_python_platform(),
        is_cuda_available=cuda_available_str,
        cuda_compiled_version=cuda_version_str,
        cuda_runtime_version=get_running_cuda_version(run_lambda),
        cuda_module_loading=get_cuda_module_loading_config(),
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=conda_packages,
        os=get_os(run_lambda),
        libc_version=get_libc_version(),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        caching_allocator_config=get_cachingallocator_config(),
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
        rocm_version=rocm_version,
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
        env_vars=get_env_vars(),
    )

669

670
# Report template; every placeholder is a field name of `SystemEnv`.
env_info_fmt = """
==============================
        System Info
==============================
OS                           : {os}
GCC version                  : {gcc_version}
Clang version                : {clang_version}
CMake version                : {cmake_version}
Libc version                 : {libc_version}

==============================
       PyTorch Info
==============================
PyTorch version              : {torch_version}
Is debug build               : {is_debug_build}
CUDA used to build PyTorch   : {cuda_compiled_version}
ROCM used to build PyTorch   : {hip_compiled_version}

==============================
      Python Environment
==============================
Python version               : {python_version}
Python platform              : {python_platform}

==============================
       CUDA / GPU Info
==============================
Is CUDA available            : {is_cuda_available}
CUDA runtime version         : {cuda_runtime_version}
CUDA_MODULE_LOADING set to   : {cuda_module_loading}
GPU models and configuration : {nvidia_gpu_models}
Nvidia driver version        : {nvidia_driver_version}
cuDNN version                : {cudnn_version}
HIP runtime version          : {hip_runtime_version}
MIOpen runtime version       : {miopen_runtime_version}
Is XNNPACK available         : {is_xnnpack_available}

==============================
          CPU Info
==============================
{cpu_info}

==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}
""".strip()

# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt += "\n\n"

env_info_fmt += """
==============================
         vLLM Info
==============================
ROCM Version                 : {rocm_version}
vLLM Version                 : {vllm_version}
vLLM Build Flags:
  {vllm_build_flags}
GPU Topology:
  {gpu_topo}

==============================
     Environment Variables
==============================
{env_vars}
""".strip()


def pretty_str(envinfo):
    """Render a ``SystemEnv`` tuple as the human-readable report text.

    Booleans become Yes/No, None becomes "Could not collect", empty package
    listings become "No relevant packages", and package lines are tagged with
    ``[pip]``/``[pip3]``/``[conda]`` prefixes before the result is substituted
    into ``env_info_fmt``.
    """

    def replace_nones(dct, replacement="Could not collect"):
        # In-place: values are overwritten, keys are untouched.
        for key, value in dct.items():
            if value is None:
                dct[key] = replacement
        return dct

    def replace_bools(dct, true="Yes", false="No"):
        # Identity checks on purpose: only real bools are rewritten.
        for key, value in dct.items():
            if value is True:
                dct[key] = true
            elif value is False:
                dct[key] = false
        return dct

    def prepend(text, tag="[prepend]"):
        return "\n".join(tag + line for line in text.split("\n"))

    def replace_if_empty(text, replacement="No relevant packages"):
        if text is not None and len(text) == 0:
            return replacement
        return text

    def maybe_start_on_next_line(string):
        # If `string` is multiline, prepend a \n to it.
        if string is not None and "\n" in string:
            return "\n{}\n".format(string)
        return string

    mutable_dict = envinfo._asdict()

    # If nvidia_gpu_models is multiline, start on the next line
    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(
        envinfo.nvidia_gpu_models
    )

    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
    dynamic_cuda_fields = [
        "cuda_runtime_version",
        "nvidia_gpu_models",
        "nvidia_driver_version",
    ]
    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
    all_dynamic_cuda_fields_missing = all(
        mutable_dict[field] is None for field in dynamic_cuda_fields
    )
    no_cuda = (
        TORCH_AVAILABLE
        and not torch.cuda.is_available()
        and all_dynamic_cuda_fields_missing
    )
    if no_cuda:
        for field in all_cuda_fields:
            mutable_dict[field] = "No CUDA"
        if envinfo.cuda_compiled_version is None:
            mutable_dict["cuda_compiled_version"] = "None"

    # Replace True with Yes, False with No
    mutable_dict = replace_bools(mutable_dict)

    # Replace all None objects with 'Could not collect'
    mutable_dict = replace_nones(mutable_dict)

    # If either of these are '', replace with 'No relevant packages'
    mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
    mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])

    # Tag conda and pip packages with a prefix
    # If they were previously None, they'll show up as ie '[conda] Could not collect'
    if mutable_dict["pip_packages"]:
        mutable_dict["pip_packages"] = prepend(
            mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)
        )
    if mutable_dict["conda_packages"]:
        mutable_dict["conda_packages"] = prepend(
            mutable_dict["conda_packages"], "[conda] "
        )
    # cpu_info is passed through exactly as collected.
    mutable_dict["cpu_info"] = envinfo.cpu_info
    return env_info_fmt.format(**mutable_dict)


def get_pretty_env_info():
    """Collect the environment information and return it as formatted report text."""
    env_info = get_env_info()
    return pretty_str(env_info)


def main():
    """Print the environment report to stdout.

    On Linux, when torch exposes ``torch.utils._crash_handler`` and its
    minidump directory contains files, a hint naming the most recently
    changed one is additionally written to stderr.
    """
    print("Collecting environment information...")
    output = get_pretty_env_info()
    print(output)

    has_crash_handler = (
        TORCH_AVAILABLE
        and hasattr(torch, "utils")
        and hasattr(torch.utils, "_crash_handler")
    )
    if not has_crash_handler:
        return

    minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
    if sys.platform != "linux" or not os.path.exists(minidump_dir):
        return

    dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
    if not dumps:
        # Directory exists but is empty: max() on an empty list would raise.
        return

    latest = max(dumps, key=os.path.getctime)
    ctime = os.path.getctime(latest)
    creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    msg = (
        "\n*** Detected a minidump at {} created on {}, ".format(
            latest, creation_time
        )
        + "if this is related to your bug please include it when you file a report ***"
    )
    print(msg, file=sys.stderr)


858
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    main()