#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate attribution CSV files for container images.

Extracts dpkg and Python package information from a container image by running
helper scripts inside the container via `docker run`. Optionally computes a diff
against a base image to show only added/changed packages.

Usage:
    python generate_attributions.py IMAGE [--output out.csv] [--base-image base:tag]
    python generate_attributions.py IMAGE --framework vllm --cuda-version 12.9
"""

import argparse
import csv
import logging
import sys
from pathlib import Path

# Allow running as a script from any directory
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_ROOT = _SCRIPT_DIR.parent.parent
sys.path.insert(0, str(_SCRIPT_DIR))

from extractors.dpkg import extract_dpkg  # noqa: E402
from extractors.python_pkgs import extract_python  # noqa: E402

log = logging.getLogger(__name__)

VALID_TYPES = {"dpkg", "python"}


def _as_mapping(value: object) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict.

    YAML keys that are present but set to ``null`` (or to a scalar) load as
    non-dict values; normalising them lets lookups fall through to the
    regular "not found" error path instead of raising AttributeError.
    """
    return value if isinstance(value, dict) else {}


def resolve_base_image(
    framework: str,
    target: str,
    cuda_version: str,
    context_yaml_path: Path,
) -> str:
    """Resolve the base image from context.yaml for a given framework/target/cuda combo.

    For ``target == "frontend"`` the value of ``dynamo.frontend_image`` is
    returned as-is. For any other target the result is
    ``"<runtime_image>:<runtime_image_tag>"`` read from
    ``<framework>.cuda<cuda_version>``.

    Every failure (PyYAML missing, file missing, malformed file, keys missing)
    is logged and terminates the process via ``sys.exit(1)``.
    """
    try:
        import yaml
    except ImportError:
        log.error(
            "PyYAML is required for --framework/--cuda-version base image resolution. "
            "Install it with: pip install pyyaml"
        )
        sys.exit(1)

    if not context_yaml_path.is_file():
        log.error("context.yaml not found at %s", context_yaml_path)
        sys.exit(1)

    with open(context_yaml_path, "r", encoding="utf-8") as f:
        context = yaml.safe_load(f)

    # safe_load returns None for an empty document and may return a list or
    # scalar for other content; only a mapping can be queried below.
    if not isinstance(context, dict):
        log.error(
            "context.yaml at %s is empty or not a mapping", context_yaml_path
        )
        sys.exit(1)

    if target == "frontend":
        frontend_image = _as_mapping(context.get("dynamo")).get("frontend_image")
        if not frontend_image:
            log.error("frontend_image not found in context.yaml dynamo section")
            sys.exit(1)
        return frontend_image

    # Runtime target: look up runtime_image and runtime_image_tag
    cuda_key = f"cuda{cuda_version}"
    cuda_config = _as_mapping(_as_mapping(context.get(framework)).get(cuda_key))
    runtime_image = cuda_config.get("runtime_image")
    runtime_image_tag = cuda_config.get("runtime_image_tag")

    if not runtime_image or not runtime_image_tag:
        log.error(
            "Could not resolve base image for framework=%s cuda=%s target=%s. "
            "Keys runtime_image/runtime_image_tag not found under %s.%s in context.yaml",
            framework,
            cuda_version,
            target,
            framework,
            cuda_key,
        )
        sys.exit(1)

    return f"{runtime_image}:{runtime_image_tag}"


def compute_diff(
    target_packages: list[dict[str, str]],
    base_packages: list[dict[str, str]],
) -> list[dict[str, str]]:
    """Compute packages in target that are new or have different versions vs base.

    Packages are matched on ``(package_name, type)``. If the base list holds
    several entries with the same key, the last one wins. Returns the target
    entries, in their original order, that are absent from base or whose
    ``version`` differs.
    """
    base_lookup = {
        (pkg["package_name"], pkg["type"]): pkg["version"] for pkg in base_packages
    }

    diff = []
    for pkg in target_packages:
        base_version = base_lookup.get((pkg["package_name"], pkg["type"]))
        if base_version is None or base_version != pkg["version"]:
            diff.append(pkg)
    return diff


def _write_rows(stream, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write a header line followed by *rows* to *stream* as CSV."""
    writer = csv.DictWriter(stream, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)


def write_csv(packages: list[dict[str, str]], output_path: str | None) -> None:
    """Write packages to CSV, sorted by (type, package_name).

    When *output_path* is given the file is created or overwritten as UTF-8;
    otherwise the CSV is written to stdout.
    """
    sorted_packages = sorted(packages, key=lambda p: (p["type"], p["package_name"]))
    fieldnames = ["package_name", "version", "type", "spdx_license"]

    if output_path:
        # newline="" is what the csv module requires for file objects;
        # explicit UTF-8 keeps the output independent of the host locale.
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            _write_rows(f, fieldnames, sorted_packages)
        log.info("Wrote %d entries to %s", len(sorted_packages), output_path)
    else:
        _write_rows(sys.stdout, fieldnames, sorted_packages)


def extract_all(
    image: str,
    types: set[str],
    docker_cmd: str,
    verbose: bool,
) -> list[dict[str, str]]:
    """Run all requested extractors against an image.

    dpkg results (if requested) come first, followed by Python results.
    """
    packages = []

    if "dpkg" in types:
        log.info("Extracting dpkg packages from %s ...", image)
        packages.extend(extract_dpkg(image, docker_cmd=docker_cmd, verbose=verbose))

    if "python" in types:
        log.info("Extracting Python packages from %s ...", image)
        packages.extend(extract_python(image, docker_cmd=docker_cmd, verbose=verbose))

    return packages


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description="Generate attribution CSV files for container images",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s my-registry/dynamo:vllm-runtime -o vllm.csv
  %(prog)s my-registry/dynamo:vllm-runtime --framework vllm --cuda-version 12.9 -o vllm.csv
  %(prog)s my-registry/dynamo:vllm-runtime --base-image nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04 -o vllm.csv
  %(prog)s my-registry/dynamo:frontend --framework dynamo --target frontend -o frontend.csv
        """,
    )
    parser.add_argument(
        "image", help="Container image to scan (e.g., my-registry/dynamo:latest)"
    )
    parser.add_argument(
        "--output",
        "-o",
        help="Output CSV file path (default: stdout)",
    )
    parser.add_argument(
        "--base-image",
        help="Base image for diff calculation (explicit, overrides --framework auto-resolve)",
    )
    parser.add_argument(
        "--framework",
        choices=["vllm", "sglang", "trtllm", "dynamo"],
        help="Framework name for auto-resolving base image from context.yaml",
    )
    parser.add_argument(
        "--target",
        default="runtime",
        choices=["runtime", "frontend"],
        help="Build target for base image resolution (default: runtime)",
    )
    parser.add_argument(
        "--cuda-version",
        choices=["12.9", "13.0", "13.1"],
        help="CUDA version for base image resolution",
    )
    parser.add_argument(
        "--context-yaml",
        default=str(_REPO_ROOT / "container" / "context.yaml"),
        help="Path to context.yaml (default: container/context.yaml in repo root)",
    )
    parser.add_argument(
        "--types",
        default="dpkg,python",
        help="Comma-separated extraction types (default: dpkg,python)",
    )
    parser.add_argument(
        "--docker-cmd",
        default="docker",
        help="Docker command to use (default: docker)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser.parse_args()


def _parse_types(raw: str) -> set[str]:
    """Split a comma-separated --types value, ignoring blanks and padding.

    "dpkg, python" and "dpkg,python," both yield {"dpkg", "python"}.
    """
    return {item.strip() for item in raw.split(",") if item.strip()}


def main() -> None:
    """Entry point: extract packages, write the full CSV, then the optional diff.

    The diff is produced only when a base image is known, either from
    ``--base-image`` or resolved from context.yaml via ``--framework``.
    """
    args = parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    types = _parse_types(args.types)
    if not types:
        log.error("No extraction types given (valid: %s)", VALID_TYPES)
        sys.exit(1)
    invalid = types - VALID_TYPES
    if invalid:
        log.error("Invalid types: %s (valid: %s)", invalid, VALID_TYPES)
        sys.exit(1)

    # Resolve base image if needed; an explicit --base-image always wins.
    base_image = args.base_image
    if not base_image and args.framework:
        if args.target != "frontend" and not args.cuda_version:
            log.error(
                "--cuda-version is required when using --framework for runtime targets"
            )
            sys.exit(1)
        base_image = resolve_base_image(
            framework=args.framework,
            target=args.target,
            cuda_version=args.cuda_version or "",
            context_yaml_path=Path(args.context_yaml),
        )
        log.info("Auto-resolved base image: %s", base_image)

    # Extract from target image
    target_packages = extract_all(args.image, types, args.docker_cmd, args.verbose)
    log.info("Total packages extracted from target: %d", len(target_packages))

    # Write full CSV
    write_csv(target_packages, args.output)

    # Compute and write diff if base image is available
    if base_image:
        log.info("Extracting packages from base image for diff: %s", base_image)
        base_packages = extract_all(base_image, types, args.docker_cmd, args.verbose)
        log.info("Total packages extracted from base: %d", len(base_packages))

        diff_packages = compute_diff(target_packages, base_packages)
        log.info("Diff: %d new/changed packages", len(diff_packages))

        if args.output:
            # Insert _diff before the file extension
            output_path = Path(args.output)
            diff_path = str(output_path.with_stem(output_path.stem + "_diff"))
            write_csv(diff_packages, diff_path)
        else:
            # The separator goes to stderr, so on stdout the diff table follows
            # the full table directly; its repeated header row marks the start.
            print("\n# --- DIFF (new/changed packages vs base) ---", file=sys.stderr)
            write_csv(diff_packages, None)


if __name__ == "__main__":
    main()