#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import regex as re

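# NOTE: `Path.relative_to(..., walk_up=True)`, used below when building links,
# is only available on Python 3.12+.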
if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

# Skeleton shared by every generated index page.
#   {comment} -> rendered inside an HTML comment at the top of the page
#   {items}   -> the pre-rendered anchor lines, one per line
# Filled in with `str.format`, so no other braces may appear in the literal.
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
  <!-- {comment} -->
  <meta name="pypi:repository-version" content="1.0">
  <body>
{items}
  </body>
</html>
"""


@dataclass
class WheelFileInfo:
    """Metadata extracted from a single wheel file name (see `parse_from_filename`)."""

    # leading component of the wheel file name
    package_name: str
    # version string with the variant suffix (if one was detected) removed
    version: str
    # optional build tag component; None when the file name has none
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    # variant split off from the version (e.g. "cu129"); None when absent
    variant: str | None
    # the wheel file name exactly as it was handed to the parser
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Split a wheel file name into its components and detect the build variant.

    Expected file name layout:
        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl

    The version component may carry a variant:
      * versions containing "dev": the variant is whatever follows the last "."
        after the last "dev" (e.g. `...dev14+gaa384b3c0.cu130` -> `cu130`);
      * all other versions: the variant is the part after "+"
        (e.g. `0.10.2rc2+cu129` -> `cu129`).
    In both cases the variant is removed from the returned `version`.

    Examples of accepted names:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl

    Raises:
        ValueError: if `file` does not follow the wheel file name layout.
    """
    pattern = r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    parsed = re.match(pattern, file)
    if parsed is None:
        raise ValueError(f"Invalid wheel file name: {file}")

    # named groups map one-to-one onto WheelFileInfo fields
    fields = parsed.groupdict()

    # split the variant off the version
    bare_version = fields["version"]
    variant = None
    if "dev" in bare_version:
        after_dev = bare_version.rsplit("dev", 1)[-1]
        if "." in after_dev:
            variant = after_dev.rsplit(".", 1)[-1]
            bare_version = bare_version.removesuffix("." + variant)
    elif "+" in bare_version:
        bare_version, variant = bare_version.split("+")
    fields["version"] = bare_version

    return WheelFileInfo(variant=variant, filename=file, **fields)


# ---- HTML index rendering ----
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.

    Args:
        subdir_names: Names of the sub-directories to link to. Each name has
            surrounding "/" removed, then surrounding "." removed, before use.
        comment: Text placed in the HTML comment at the top of the page.

    Returns:
        The rendered HTML page, with one link per name in sorted order.
    """
    href_tags = []
    for raw_name in sorted(subdir_names):
        name = raw_name.strip("/").strip(".")
        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.

    Args:
        wheel_files: Parsed wheels belonging to the package.
        wheel_base_dir: Directory that holds the wheel files.
        index_base_dir: Directory the generated index page will live in; links
            are written relative to it.
        comment: Text placed in the HTML comment at the top of the page.

    Returns:
        A `(index_html, metadata_json)` tuple. `metadata_json` is a JSON array
        with one object per wheel: all `WheelFileInfo` fields plus a `path` key
        holding the same quoted relative link used in the HTML.
    """
    # Location of the wheel directory as seen from the index directory. It is
    # the same for every wheel, so compute it once.
    wheel_dir_from_index = wheel_base_dir.relative_to(index_base_dir, walk_up=True)

    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = wheel_dir_from_index / file.filename
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
        metadata.append({**asdict(file), "path": file_path_quoted})

    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
) -> None:
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str): Optional comment to include in the generated HTML files.

    Raises:
        ValueError: if a wheel file name cannot be parsed, if `default_variant` is given
            but some wheel has no variant suffix or no wheel has that variant, or if
            `alias_to_default` collides with an existing variant name.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Every generated HTML page (project lists and package indices) starts with the same HTML comment:
    "Generated on <ISO timestamp>", followed by " (<comment>)" when `comment` is non-empty.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json # machine-readable metadata for all wheels in this package
            cpu/ # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/ # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/ # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant; wheels without a variant go under the "default" key
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant_to_files.setdefault(file.variant or "default", []).append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )
        # Publish the chosen variant in the root index as well. Links are
        # recomputed per target directory further down, so a plain copy of the
        # file list is sufficient.
        variant_to_files["default"] = variant_to_files[default_variant].copy()
        print(f"Variant '{default_variant}' is used as the default variant.")

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header (shared by every page written below)
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names: set[str] = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = {f.package_name for f in files}
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names |= packages
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            (variant_dir / "index.html").write_text(project_list_str, encoding="utf-8")

        # sorted only to make the processing order deterministic
        for package in sorted(packages):
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            # pass the full header comment so package pages match the project lists
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment_tmpl
            )
            (package_dir / "index.html").write_text(index_str, encoding="utf-8")
            (package_dir / "metadata.json").write_text(metadata_str, encoding="utf-8")

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    (index_base_dir / "index.html").write_text(project_list_str, encoding="utf-8")


if __name__ == "__main__":
    # Arguments:
    #     --version <version> : version string for the current build (e.g., commit hash)
    #     --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
    #     --output-dir <output_directory> : directory to store generated index files
    #     --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    #     --comment <comment_string> : (optional) comment string to include in generated HTML files

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    parser.add_argument(
        "--comment",
        type=str,
        default="",
        help="Optional comment string to include in generated HTML files",
    )

    args = parser.parse_args()

    # The version becomes a single path component below, so reject separators.
    version = args.version
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path, encoding="utf-8") as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # current_objects looks like from list_objects_v2 S3 API:
    #
    # "Contents": [
    #     {
    #         "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
    #         "LastModified": "2025-11-28T14:00:32+00:00",
    #         "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
    #         "ChecksumAlgorithm": [
    #             "CRC64NVME"
    #         ],
    #         "ChecksumType": "FULL_OBJECT",
    #         "Size": 435649349,
    #         "StorageClass": "STANDARD"
    #     },
    #     ...
    # ]

    # Extract wheel file keys; only the filename (last path component) is used
    object_keys = (item["Key"] for item in current_objects.get("Contents", []))
    wheel_files = [key.split("/")[-1] for key in object_keys if key.endswith(".whl")]

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    # s3://vllm-wheels/{version}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
    wheel_base_dir = output_dir.parent / version
    index_base_dir = output_dir

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")