#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import sys
from dataclasses import asdict, dataclass
12
from datetime import datetime
13
14
15
16
from pathlib import Path
from typing import Any
from urllib.parse import quote

import regex as re

19
20
21
22
23
24
25
26
27
28
29
30

def normalize_package_name(name: str) -> str:
    """
    Return the PEP 503 normalized form of a package name.
    https://peps.python.org/pep-0503/#normalized-names

    Every run of hyphens, underscores, and periods collapses into a single
    hyphen, and the whole name is lowercased.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower()


31
32
33
34
35
if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

# Skeleton shared by every generated index page.
# Placeholders filled via `str.format`:
#   {comment} - text placed inside an HTML comment at the top of the page
#   {items}   - pre-rendered anchor lines, one per link
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
  <!-- {comment} -->
  <meta name="pypi:repository-version" content="1.0">
  <body>
{items}
  </body>
</html>
"""


@dataclass
class WheelFileInfo:
    package_name: str
    version: str
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    variant: str | None
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Parse wheel file name to extract metadata.

    The format of wheel names:
        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
    Example:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
    """
    wheel_file_re = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = wheel_file_re.match(file)
    if not match:
        raise ValueError(f"Invalid wheel file name: {file}")

    package_name = match.group("package_name")
    version = match.group("version")
    build_tag = match.group("build_tag")
    python_tag = match.group("python_tag")
    abi_tag = match.group("abi_tag")
    platform_tag = match.group("platform_tag")

    # extract variant from version
    variant = None
    if "dev" in version:
        ver_after_dev = version.split("dev")[-1]
        if "." in ver_after_dev:
            variant = ver_after_dev.split(".")[-1]
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
93
94
95
96
97
98
99
            version_part, suffix = version.split("+", 1)
            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
            # Git hashes and other suffixes are NOT variants
            if suffix.startswith(("rocm", "cu", "cpu")):
                variant = suffix
                version = version_part
            # Otherwise keep the full version string (variant stays None)
100
101
102
103
104
105
106
107
108
109
110
111
112

    return WheelFileInfo(
        package_name=package_name,
        version=version,
        build_tag=build_tag,
        python_tag=python_tag,
        abi_tag=abi_tag,
        platform_tag=platform_tag,
        variant=variant,
        filename=file,
    )


113
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.

    Args:
        subdir_names: Sub-directory names to link to. They are sorted, and
            leading/trailing '/' and then '.' characters are stripped from each.
        comment: Text substituted into the template's HTML comment.

    Returns:
        The rendered HTML page as a string.
    """
    cleaned_names = (name.strip("/").strip(".") for name in sorted(subdir_names))
    href_tags = [f'    <a href="{name}/">{name}/</a><br/>' for name in cleaned_names]
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
122
123
124


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.

    Args:
        wheel_files: Parsed wheels belonging to the package; processed in
            file-name order.
        wheel_base_dir: Directory that holds the wheel files.
        index_base_dir: Directory the generated index will live in; links are
            written relative to it.
        comment: Text substituted into the template's HTML comment.

    Returns:
        A `(index_html, metadata_json)` tuple. The JSON is a list with one
        object per wheel: all WheelFileInfo fields plus a "path" entry holding
        the quoted relative link.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        # walk_up=True lets the link climb out of the index directory ("../")
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
) -> None:
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str): Optional comment to include in the generated HTML files.

    Raises:
        ValueError: If a wheel file name cannot be parsed, if `default_variant`
            is given but some wheel has no variant or the named variant is
            missing, or if `alias_to_default` clashes with an existing variant.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json # machine-readable metadata for all wheels in this package
            cpu/ # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/ # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/ # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # For ROCm builds: inherit variant from vllm wheel
    # All ROCm wheels should share the same variant as vllm
    rocm_variant = None
    for file in parsed_files:
        if (
            file.package_name == "vllm"
            and file.variant
            and file.variant.startswith("rocm")
        ):
            rocm_variant = file.variant
            print(f"Detected ROCm variant from vllm: {rocm_variant}")
            break

    # Apply ROCm variant to all wheels without a variant
    if rocm_variant:
        for file in parsed_files:
            if file.variant is None:
                file.variant = rocm_variant
                print(f"Inherited variant '{rocm_variant}' for {file.filename}")

    # Group by variant; wheels without a variant go under the "default" key
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant_to_files.setdefault(file.variant or "default", []).append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names: set[str] = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            # default variant lives directly in the root index directory
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant (normalized per PEP 503)
        packages = {normalize_package_name(f.package_name) for f in files}
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names |= packages
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            (variant_dir / "index.html").write_text(project_list_str, encoding="utf-8")

        for package in sorted(packages):
            # filter files belonging to this package only (compare normalized names)
            package_files = [
                f for f in files if normalize_package_name(f.package_name) == package
            ]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            # Pass the same timestamped header comment the project lists get,
            # so every generated page carries identical provenance information
            # (previously the raw, possibly empty, `comment` was passed here).
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment_tmpl
            )
            (package_dir / "index.html").write_text(index_str, encoding="utf-8")
            (package_dir / "metadata.json").write_text(metadata_str, encoding="utf-8")

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    (index_base_dir / "index.html").write_text(project_list_str, encoding="utf-8")


if __name__ == "__main__":
    # Arguments:
    #     --version <version> : version string for the current build (e.g., commit hash)
    #     --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
    #     --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
    #     --output-dir <output_directory> : directory to store generated index files
    #     --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    #     --comment <comment_string> : (optional) comment string to include in generated HTML files

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--wheel-dir",
        type=str,
        default=None,
        help="Directory containing wheel files (default to be same as `version`)",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    parser.add_argument(
        "--comment",
        type=str,
        default="",
        help="Optional comment string to include in generated HTML files",
    )

    args = parser.parse_args()

    version = args.version
    # Allow rocm/ prefix, reject other slashes and all backslashes.
    # The prefix is removed before looking for '/', so values such as
    # "rocm/a/b" are rejected too instead of slipping through.
    if "\\" in version:
        raise ValueError("Version string must not contain backslashes.")
    if "/" in version.removeprefix("rocm/"):
        raise ValueError(
            "Version string must not contain slashes (except for 'rocm/' prefix)."
        )

    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path, encoding="utf-8") as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # current_objects looks like from list_objects_v2 S3 API:
    #
    # "Contents": [
    #     {
    #         "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
    #         "LastModified": "2025-11-28T14:00:32+00:00",
    #         "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
    #         "ChecksumAlgorithm": [
    #             "CRC64NVME"
    #         ],
    #         "ChecksumType": "FULL_OBJECT",
    #         "Size": 435649349,
    #         "StorageClass": "STANDARD"
    #     },
    #     ...
    # ]

    # Extract wheel file keys
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
        wheel_files = [
            name for name in wheel_files if version in name and "dev" not in name
        ]
        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
    else:
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
    # s3://vllm-wheels/{wheel_dir}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
    #
    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
    #   - rocm/{commit}/  (same as wheels)
    #   - rocm/nightly/
    #   - rocm/{version}/
    # All these are under the "rocm/" prefix, so relative paths should be
    # relative to "rocm/", not the bucket root.
    if args.wheel_dir:
        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
        wheel_dir = args.wheel_dir.strip().rstrip("/")
    elif version.startswith("rocm/"):
        # For rocm/commit, wheel_base_dir should be just the commit part
        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
        wheel_dir = version.split("/", 1)[1]
    else:
        wheel_dir = version
    # wheels are assumed to sit in a sibling directory of the index output
    wheel_base_dir = output_dir.parent / wheel_dir
    index_base_dir = output_dir

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")