# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# install_nixl_from_source_ubuntu.py
import argparse
import glob
import json
import os
import shutil
import subprocess
import sys
import urllib.request

# --- Configuration ---
# Directory where repaired (self-contained) NIXL wheels are stored and looked
# up; can be overridden through the WHEELS_CACHE_HOME environment variable.
WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache")
# Directory containing this script; the temporary wheelhouse is created here.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Locations of the UCX checkout, the NIXL checkout and the UCX install prefix.
UCX_DIR = os.path.join("/tmp", "ucx_source")
NIXL_DIR = os.path.join("/tmp", "nixl_source")
UCX_INSTALL_DIR = os.path.join("/tmp", "ucx_install")
# Git repositories cloned by the build.
UCX_REPO_URL = "https://github.com/openucx/ucx.git"
NIXL_REPO_URL = "https://github.com/ai-dynamo/nixl.git"


# --- Helper Functions ---
def get_latest_nixl_version():
    """Return the tag name of the latest NIXL release published on GitHub.

    This is a best-effort lookup against the GitHub "latest release" API for
    ai-dynamo/nixl. It never raises: if the request fails, times out, returns
    something that cannot be parsed, or carries no usable ``tag_name``, the
    pinned fallback version ``"0.7.0"`` is returned instead.

    Returns:
        str: The release tag reported by GitHub, or ``"0.7.0"`` on any failure.
    """
    fallback_version = "0.7.0"
    nixl_release_url = "https://api.github.com/repos/ai-dynamo/nixl/releases/latest"
    try:
        # Bound the request so an unreachable network cannot hang the script.
        with urllib.request.urlopen(nixl_release_url, timeout=10) as response:
            data = json.load(response)
        tag_name = data.get("tag_name")
    except Exception as exc:
        # Deliberately broad: the lookup is optional, so any failure falls
        # back to the pinned version, but the reason is reported.
        print(
            f"--> WARNING: Could not determine latest NIXL release ({exc}); "
            f"falling back to {fallback_version}",
            flush=True,
        )
        return fallback_version

    # A missing, null or empty tag is as unusable as a failed request.
    if isinstance(tag_name, str) and tag_name:
        return tag_name
    return fallback_version


# NIXL version (git tag) to build. Only query GitHub when the NIXL_VERSION
# environment variable is unset or empty: passing the lookup as the default
# argument of os.environ.get() would evaluate it eagerly and hit the network
# even when a version was pinned explicitly.
NIXL_VERSION = os.environ.get("NIXL_VERSION") or get_latest_nixl_version()


37
def run_command(command, cwd=".", env=None):
    """Echo a command and run it, raising if it exits with a non-zero status.

    Args:
        command: Argument list (program followed by its arguments). It is
            executed directly, without a shell.
        cwd: Directory in which the command is run.
        env: Optional environment mapping for the child process; ``None``
            makes the child inherit the current environment.

    Raises:
        subprocess.CalledProcessError: If the command returns a non-zero
            exit code.
    """
    print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True)
    subprocess.check_call(command, cwd=cwd, env=env)


def is_pip_package_installed(package_name):
    """Report whether ``pip show`` finds *package_name* for this interpreter.

    Runs ``<sys.executable> -m pip show <package_name>`` with its output
    discarded and inspects only the exit status, so no exception is raised
    when the package is absent.

    Args:
        package_name: Distribution name to look up.

    Returns:
        bool: ``True`` if ``pip show`` exited with status 0, else ``False``.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "show", package_name],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,  # a non-zero exit is an answer here, not an error
    )
    return result.returncode == 0


def find_nixl_wheel_in_cache(cache_dir, version=None):
    """Find a NIXL wheel for a given version inside *cache_dir*.

    Matches files named ``nixl*<version>*.whl`` directly inside *cache_dir*.
    The pattern is loose enough to match both the freshly built wheel and the
    repaired one carrying a ``manylinux`` tag.

    Args:
        cache_dir: Directory to search. It is treated literally, so glob
            metacharacters (``[``, ``*``, ``?``) in the path are escaped.
        version: Version string that must appear in the file name. Defaults
            to the module-level ``NIXL_VERSION``.

    Returns:
        str | None: Path of the matching wheel, or ``None`` if there is none
        (including when *cache_dir* does not exist). When several wheels
        match, the lexicographically last path is returned.
    """
    if version is None:
        version = NIXL_VERSION
    # Escape only the directory part; the file-name part is the real pattern.
    search_pattern = os.path.join(glob.escape(cache_dir), f"nixl*{version}*.whl")
    wheels = glob.glob(search_pattern)
    return max(wheels) if wheels else None


def install_system_dependencies():
    """Install the system packages needed for the build via apt-get.

    When the effective user is root, runs ``apt-get update`` followed by
    ``apt-get install -y`` for the required packages. Otherwise nothing is
    installed; a warning listing the packages is printed so the user can
    install them manually.

    Raises:
        subprocess.CalledProcessError: If an apt-get invocation fails
            (root path only).
    """
    # Single source of truth for both the warning text and the install step.
    apt_packages = [
        "patchelf",  # needed by `auditwheel repair`
        "build-essential",
        "git",
        "cmake",
        "ninja-build",
        "autotools-dev",
        "automake",
        "meson",
        "libtool",
        "libtool-bin",
    ]

    if os.geteuid() != 0:
        print("\n---", flush=True)
        # Implicit concatenation instead of a backslash continuation inside
        # the literal, which embedded the next line's indentation in the text.
        print(
            "WARNING: Not running as root. "
            "Skipping system dependency installation.",
            flush=True,
        )
        print(
            "Please ensure the listed packages are installed on your system:",
            flush=True,
        )
        print(f"  {' '.join(apt_packages)}", flush=True)
        print("---\n", flush=True)
        return

    print("--- Running as root. Installing system dependencies... ---", flush=True)
    run_command(["apt-get", "update"])
    run_command(["apt-get", "install", "-y", *apt_packages])
    print("--- System dependencies installed successfully. ---\n", flush=True)


def build_and_install_prerequisites(args):
    """Install NIXL, building UCX and a self-contained NIXL wheel if needed.

    Unless ``args.force_reinstall`` is set, the function returns early when
    NIXL is already installed, and installs a matching wheel from
    ``WHEELS_CACHE_HOME`` when one exists. Otherwise it:

    1. builds and installs UCX from source,
    2. builds a NIXL wheel against that UCX,
    3. repairs the wheel with auditwheel into ``WHEELS_CACHE_HOME``,

    and finally installs the repaired wheel with pip.

    Args:
        args: Parsed command-line namespace; only ``force_reinstall`` (bool)
            is read.

    Raises:
        RuntimeError: If the built or repaired wheel cannot be found.
        subprocess.CalledProcessError: If any external command fails.
    """
    if not args.force_reinstall:
        if is_pip_package_installed("nixl"):
            print("--> NIXL is already installed. Nothing to do.", flush=True)
            return

        cached_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
        if cached_wheel:
            print(
                "\n--> Found self-contained wheel: "
                f"{os.path.basename(cached_wheel)}.",
                flush=True,
            )
            print(
                "--> Installing from cache, skipping all source builds.",
                flush=True,
            )
            # Installed with --no-deps, like the freshly built wheel, so both
            # paths leave the environment's other packages untouched.
            _install_nixl_wheel(cached_wheel)
            print("\n--- Installation from cache complete. ---", flush=True)
            return

    print(
        "\n--> No installed package or cached wheel found. "
        "Starting full build process...",
        flush=True,
    )
    print("\n--> Installing auditwheel...", flush=True)
    run_command([sys.executable, "-m", "pip", "install", "auditwheel"])
    install_system_dependencies()
    ucx_install_path = os.path.abspath(UCX_INSTALL_DIR)
    print(f"--> Using wheel cache directory: {WHEELS_CACHE_HOME}", flush=True)
    os.makedirs(WHEELS_CACHE_HOME, exist_ok=True)

    _build_ucx(ucx_install_path)

    build_env = _make_nixl_build_env(ucx_install_path)
    temp_wheel_dir = os.path.join(ROOT_DIR, "temp_wheelhouse")
    try:
        _build_nixl_wheel(build_env, temp_wheel_dir)
        _repair_nixl_wheel(build_env, temp_wheel_dir)
    finally:
        # Remove the temporary wheelhouse even when the build or the repair
        # fails, so a failed run does not leave stale wheels behind.
        shutil.rmtree(temp_wheel_dir, ignore_errors=True)

    newly_built_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
    if not newly_built_wheel:
        raise RuntimeError("Failed to find the repaired NIXL wheel.")

    print(
        "--> Successfully built self-contained wheel: "
        f"{os.path.basename(newly_built_wheel)}. Now installing...",
        flush=True,
    )
    _install_nixl_wheel(newly_built_wheel, force_reinstall=args.force_reinstall)
    print("--- NIXL installation complete ---", flush=True)


def _install_nixl_wheel(wheel_path, force_reinstall=False):
    """Install *wheel_path* with pip without resolving its dependencies."""
    install_command = [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--no-deps",  # w/o "no-deps", it will install cuda-torch
    ]
    if force_reinstall:
        install_command.append("--force-reinstall")
    install_command.append(wheel_path)
    run_command(install_command)


def _build_ucx(ucx_install_path):
    """Clone UCX if needed, then configure, build and install it."""
    print("\n[1/3] Configuring and building UCX from source...", flush=True)
    if not os.path.exists(UCX_DIR):
        run_command(["git", "clone", UCX_REPO_URL, UCX_DIR])
    ucx_source_path = os.path.abspath(UCX_DIR)
    run_command(["git", "checkout", "v1.19.x"], cwd=ucx_source_path)
    run_command(["./autogen.sh"], cwd=ucx_source_path)
    configure_command = [
        "./configure",
        f"--prefix={ucx_install_path}",
        "--enable-shared",
        "--disable-static",
        "--disable-doxygen-doc",
        "--enable-optimizations",
        "--enable-cma",
        "--enable-devel-headers",
        "--with-verbs",
        "--enable-mt",
        "--with-ze=no",
    ]
    run_command(configure_command, cwd=ucx_source_path)
    run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
    run_command(["make", "install"], cwd=ucx_source_path)
    print("--- UCX build and install complete ---", flush=True)


def _make_nixl_build_env(ucx_install_path):
    """Return a copy of the environment pointing the NIXL build at UCX."""
    build_env = os.environ.copy()
    ucx_lib_path = os.path.join(ucx_install_path, "lib")
    ucx_plugin_path = os.path.join(ucx_lib_path, "ucx")
    build_env["PKG_CONFIG_PATH"] = os.path.join(ucx_lib_path, "pkgconfig")
    existing_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
    # strip(":") drops the trailing separator when LD_LIBRARY_PATH was empty.
    build_env["LD_LIBRARY_PATH"] = (
        f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(":")
    )
    build_env["LDFLAGS"] = "-Wl,-rpath,$ORIGIN"
    return build_env


def _build_nixl_wheel(build_env, wheel_dir):
    """Check out NIXL_VERSION and build its wheel into *wheel_dir*."""
    print("\n[2/3] Building NIXL wheel from source...", flush=True)
    if not os.path.exists(NIXL_DIR):
        run_command(["git", "clone", NIXL_REPO_URL, NIXL_DIR])
    else:
        # An existing checkout may predate the requested tag.
        run_command(["git", "fetch", "--tags"], cwd=NIXL_DIR)
    run_command(["git", "checkout", NIXL_VERSION], cwd=NIXL_DIR)
    print(f"--> Checked out NIXL version: {NIXL_VERSION}", flush=True)
    print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}", flush=True)
    run_command(
        [
            sys.executable,
            "-m",
            "pip",
            "wheel",
            ".",
            "--no-deps",
            f"--wheel-dir={wheel_dir}",
        ],
        cwd=os.path.abspath(NIXL_DIR),
        env=build_env,
    )


def _repair_nixl_wheel(build_env, wheel_dir):
    """Repair the wheel in *wheel_dir* with auditwheel into the wheel cache."""
    print("\n[3/3] Repairing NIXL wheel to include UCX libraries...", flush=True)
    unrepaired_wheel = find_nixl_wheel_in_cache(wheel_dir)
    if not unrepaired_wheel:
        raise RuntimeError("Failed to find the NIXL wheel after building it.")

    # We tell auditwheel to ignore the plugin that mesonpy already handled.
    auditwheel_command = [
        "auditwheel",
        "repair",
        "--exclude",
        "libplugin_UCX.so",  # excluded because mesonpy already includes it
        unrepaired_wheel,
        f"--wheel-dir={WHEELS_CACHE_HOME}",
    ]
    run_command(auditwheel_command, env=build_env)


def main(argv=None):
    """Parse command-line options and run the build/install workflow.

    Args:
        argv: Optional list of argument strings. ``None`` lets argparse read
            the process command line.
    """
    parser = argparse.ArgumentParser(
        description="Build and install UCX and NIXL dependencies."
    )
    parser.add_argument(
        "--force-reinstall",
        action="store_true",
        help=(
            "Force rebuild and reinstall of UCX and NIXL "
            "even if they are already installed."
        ),
    )
    build_and_install_prerequisites(parser.parse_args(argv))


if __name__ == "__main__":
    main()