Unverified commit f5a284b8, authored by Ran Rubin, committed by GitHub
Browse files

ci(compliance): replace docker run with BuildKit filesystem extraction (#7397)


Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 9321ae9c
......@@ -6,11 +6,15 @@ description: 'Generate attribution CSVs (dpkg + Python) for a container image an
inputs:
image:
description: 'Full container image URI to scan (must be pullable)'
description: 'Full container image URI to scan (must be accessible to the BuildKit worker)'
required: true
artifact_name:
description: 'Name for the uploaded artifact (e.g., compliance-vllm-cuda12-amd64)'
required: true
arch:
description: 'Target architecture (amd64, arm64)'
required: false
default: 'amd64'
framework:
description: 'Framework name for base image resolution (vllm, sglang, trtllm, dynamo)'
required: false
......@@ -35,49 +39,74 @@ inputs:
runs:
using: "composite"
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 #v3.11.1
- name: Initialize builder
uses: ./.github/actions/init-dynamo-builder
with:
driver: docker-container
# Enable BuildKit for enhanced metadata
buildkitd-flags: --debug
version: v0.14.1
- name: Cleanup
if: always()
shell: bash
run: |
docker system prune -af
builder_name: compliance-${{ github.run_id }}-${{ github.run_attempt }}
flavor: general
arch: ${{ inputs.arch }}
- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
pip-install: pyyaml
- name: Pull container image
shell: bash
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ inputs.image }}
- name: Generate attribution CSVs
- name: Resolve base image
id: resolve-base
shell: bash
run: |
ARGS=""
BASE_IMAGE=""
if [ -n "${{ inputs.base_image }}" ]; then
ARGS+=" --base-image ${{ inputs.base_image }}"
BASE_IMAGE="${{ inputs.base_image }}"
elif [ -n "${{ inputs.framework }}" ]; then
ARGS+=" --framework ${{ inputs.framework }}"
ARGS+=" --target ${{ inputs.target }}"
RESOLVE_ARGS="--framework ${{ inputs.framework }} --target ${{ inputs.target }}"
if [ -n "${{ inputs.cuda_version }}" ]; then
ARGS+=" --cuda-version ${{ inputs.cuda_version }}"
RESOLVE_ARGS+=" --cuda-version ${{ inputs.cuda_version }}"
fi
BASE_IMAGE=$(python3 container/compliance/resolve_base_image.py ${RESOLVE_ARGS})
fi
echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT
python container/compliance/generate_attributions.py \
"${{ inputs.image }}" \
--output "${{ inputs.artifact_name }}.csv" \
--verbose \
${ARGS}
- name: Extract target image packages
shell: bash
run: |
mkdir -p /tmp/compliance-target
docker buildx build \
--builder compliance-${{ github.run_id }}-${{ github.run_attempt }} \
--platform linux/${{ inputs.arch }} \
--build-arg TARGET_IMAGE="${{ inputs.image }}" \
--output "type=local,dest=/tmp/compliance-target" \
--pull \
--no-cache-filter extractor \
--progress=plain \
-f container/compliance/Dockerfile.extract \
container/compliance/
- name: Extract base image packages
if: steps.resolve-base.outputs.base_image != ''
shell: bash
run: |
mkdir -p /tmp/compliance-base
docker buildx build \
--builder compliance-${{ github.run_id }}-${{ github.run_attempt }} \
--platform linux/${{ inputs.arch }} \
--build-arg TARGET_IMAGE="${{ steps.resolve-base.outputs.base_image }}" \
--output "type=local,dest=/tmp/compliance-base" \
--pull \
--no-cache-filter extractor \
--progress=plain \
-f container/compliance/Dockerfile.extract \
container/compliance/
- name: Generate attribution CSVs
shell: bash
run: |
ARGS=(--target-dir /tmp/compliance-target --output "${{ inputs.artifact_name }}.csv")
if [ -n "${{ steps.resolve-base.outputs.base_image }}" ]; then
ARGS+=(--base-dir /tmp/compliance-base)
fi
python3 container/compliance/process_results.py "${ARGS[@]}"
- name: Upload attribution artifacts
if: always()
......@@ -86,3 +115,9 @@ runs:
name: ${{ inputs.artifact_name }}
path: ${{ inputs.artifact_name }}*.csv
retention-days: ${{ inputs.retention_days }}
- name: Cleanup compliance builder
if: always()
shell: bash
run: |
docker buildx rm compliance-${{ github.run_id }}-${{ github.run_attempt }} || true
......@@ -199,13 +199,9 @@ jobs:
strategy:
fail-fast: false
matrix:
include:
- arch: amd64
runner: prod-builder-amd-v1
- arch: arm64
runner: prod-tester-arm-v1
arch: [amd64, arm64]
name: Compliance frontend-${{ matrix.arch }}
runs-on: ${{ matrix.runner }}
runs-on: prod-builder-v3
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......@@ -222,6 +218,7 @@ jobs:
with:
image: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build-frontend-image.outputs.target_tag }}
artifact_name: compliance-frontend-${{ matrix.arch }}
arch: ${{ matrix.arch }}
framework: dynamo
target: frontend
......
......@@ -503,7 +503,7 @@ jobs:
if: inputs.build_image && inputs.push_image
needs: [build]
name: Compliance cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
runs-on: ${{ inputs.platform == 'amd64' && 'prod-builder-amd-v1' || 'prod-tester-arm-v1' }}
runs-on: prod-builder-v3
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......@@ -529,6 +529,7 @@ jobs:
with:
image: ${{ steps.images.outputs.runtime_image }}
artifact_name: compliance-${{ inputs.framework }}-${{ inputs.target }}${{ inputs.make_efa && '-efa' || '' }}-cuda${{ steps.images.outputs.cuda_major }}-${{ inputs.platform }}
arch: ${{ inputs.platform }}
framework: ${{ inputs.framework }}
cuda_version: ${{ inputs.cuda_version }}
......
# syntax=docker/dockerfile:1
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# BuildKit-based extraction Dockerfile.
# Mounts the target image filesystem read-only and runs helper scripts to extract
# dpkg and Python package data — no `docker run` of the target image required.
#
# Usage:
# docker buildx build \
# --builder <builder> \
# --platform linux/amd64 \
# --build-arg TARGET_IMAGE=<image:tag> \
# --output type=local,dest=<output_dir> \
# --pull \
# --no-cache-filter extractor \
# -f container/compliance/Dockerfile.extract \
# container/compliance/
#
# --no-cache-filter extractor: always re-runs the extraction stage to avoid
# stale results. BuildKit's cache key for RUN --mount=type=bind,from=<stage>
# does not reliably include the mounted stage's content digest when the source
# is a stage name rather than a direct image reference, so a cache hit could
# return TSVs from a previous run against a different image.
#
# Output files in <output_dir>:
# dpkg.tsv - tab-separated: package_name\tversion\tspdx_license
# python.tsv - tab-separated: package_name\tversion\tspdx_license
# dpkg_err.txt - stderr from dpkg extraction (for debugging)
# python_err.txt - stderr from python extraction (for debugging)
# Image whose filesystem is inspected. Defaults to `scratch` so the file still
# parses when no --build-arg is supplied.
ARG TARGET_IMAGE=scratch
# Image that provides the Python interpreter used to run the helper scripts.
ARG EXTRACTOR_IMAGE=python:3.12-slim

# The target image is never executed; it only serves as a bind-mount source.
FROM ${TARGET_IMAGE} AS target

FROM ${EXTRACTOR_IMAGE} AS extractor
RUN mkdir /output
COPY helpers/dpkg_helper.py /helpers/dpkg_helper.py
COPY helpers/python_helper.py /helpers/python_helper.py
# Run both helpers against the target filesystem mounted at /target.
# Each helper's exit status is captured instead of being discarded, so a helper
# that crashes after writing partial output cannot pass as a successful scan:
#   - dpkg helper: non-zero exit OR empty dpkg.tsv fails the build
#   - python helper: non-zero exit or empty python.tsv only emits a warning
# Both stderr captures are replayed to the build log before any check runs, so
# diagnostics are visible even when the build fails.
RUN --mount=type=bind,from=target,target=/target \
    dpkg_rc=0 ; python_rc=0 ; \
    python3 /helpers/dpkg_helper.py --root /target > /output/dpkg.tsv 2>/output/dpkg_err.txt || dpkg_rc=$? ; \
    python3 /helpers/python_helper.py --root /target > /output/python.tsv 2>/output/python_err.txt || python_rc=$? ; \
    cat /output/dpkg_err.txt >&2 ; \
    cat /output/python_err.txt >&2 ; \
    [ "$dpkg_rc" -eq 0 ] || { echo "ERROR: dpkg extraction exited with status $dpkg_rc" >&2; exit 1; } ; \
    [ -s /output/dpkg.tsv ] || { echo "ERROR: dpkg extraction produced no output" >&2; exit 1; } ; \
    [ "$python_rc" -eq 0 ] || echo "⚠️ WARNING: python extraction exited with status $python_rc" >&2 ; \
    [ -s /output/python.tsv ] || echo "⚠️ WARNING: python extraction produced no output" >&2

# Final stage contains only the extraction results, so `--output type=local`
# writes just the TSV and stderr capture files.
FROM scratch
COPY --from=extractor /output/ /
......@@ -17,54 +17,110 @@ Files are sorted by `(type, package_name)` for stable diffs.
When a base image is provided, a second `_diff.csv` file is written containing only packages that are new or version-changed relative to the base — i.e. what Dynamo's build layers added on top of the upstream image.
## Usage
## Local usage
### Prerequisites
- Docker with [BuildKit](https://docs.docker.com/build/buildkit/) support (Docker 23+)
- Python 3.11+
### Step 1 — Create a local BuildKit builder (one-time)
```bash
docker buildx create --use --name compliance-builder
```
### Step 2 — Extract packages from an image
```bash
# Full scan, output to stdout
python container/compliance/generate_attributions.py <image:tag>
docker buildx build \
--builder compliance-builder \
--platform linux/amd64 \
--build-arg TARGET_IMAGE=<image:tag> \
--output type=local,dest=./output \
--pull \
--no-cache-filter extractor \
--progress=plain \
-f container/compliance/Dockerfile.extract \
container/compliance/
```
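This produces `./output/dpkg.tsv` and `./output/python.tsv` — tab-separated files
with `package_name\tversion\tspdx_license` per line — plus `./output/dpkg_err.txt` and
`./output/python_err.txt`, which capture each helper's stderr for debugging.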
> **Why `--no-cache-filter extractor`?** BuildKit's cache key for
> `RUN --mount=type=bind,from=<stage>` does not reliably include the mounted
> stage's content digest when the source is a stage name (vs. a direct image
> reference). Without this flag, a cache hit could return TSVs from a previous
> run against a different image even if `--pull` resolved a new digest.
> `--no-cache-filter extractor` forces only the extraction stage to re-run;
> the `python:3.12-slim` base layer and helper script COPYs are still cached.
# Write to file
python container/compliance/generate_attributions.py <image:tag> -o attribution.csv
### Step 3 — Convert to CSV
```bash
python container/compliance/process_results.py \
--target-dir ./output \
--output attribution.csv
```
# With base image diff — auto-resolved from context.yaml
python container/compliance/generate_attributions.py <image:tag> \
### Full example with base image diff
Use `resolve_base_image.py` to look up the correct base image from `container/context.yaml`
rather than hardcoding the URI:
```bash
# Resolve base image from context.yaml (requires: pip install pyyaml)
BASE_IMAGE=$(python container/compliance/resolve_base_image.py \
--framework vllm \
--cuda-version 12.9 \
-o attribution-vllm-cuda12-amd64.csv
# Produces: attribution-vllm-cuda12-amd64.csv (full)
# attribution-vllm-cuda12-amd64_diff.csv (delta from base)
# With explicit base image override
python container/compliance/generate_attributions.py <image:tag> \
--base-image nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04 \
-o attribution.csv
# Frontend image
python container/compliance/generate_attributions.py <image:tag> \
--framework dynamo \
--target frontend \
-o attribution-frontend-amd64.csv
# dpkg only
python container/compliance/generate_attributions.py <image:tag> \
--types dpkg \
-o attribution-dpkg.csv
--cuda-version 12.9)
# Extract target image
docker buildx build \
--builder compliance-builder \
--platform linux/amd64 \
--build-arg TARGET_IMAGE=<image:tag> \
--output type=local,dest=./output \
--pull \
--no-cache-filter extractor \
-f container/compliance/Dockerfile.extract \
container/compliance/
# Extract base image
docker buildx build \
--builder compliance-builder \
--platform linux/amd64 \
--build-arg TARGET_IMAGE="${BASE_IMAGE}" \
--output type=local,dest=./base-output \
--pull \
--no-cache-filter extractor \
-f container/compliance/Dockerfile.extract \
container/compliance/
# Generate CSV with diff
python container/compliance/process_results.py \
--target-dir ./output \
--base-dir ./base-output \
--output attribution.csv
# Produces: attribution.csv (full) and attribution_diff.csv (delta from base)
```
### All flags
### resolve_base_image.py flags
| Flag | Default | Description |
|------|---------|-------------|
| `image` | *(required)* | Container image to scan |
| `--output`, `-o` | stdout | Output CSV path |
| `--framework` | — | Auto-resolve base image from `context.yaml` (`vllm`, `sglang`, `trtllm`, `dynamo`) |
| `--target` | `runtime` | Build target for base resolution (`runtime` or `frontend`) |
| `--cuda-version` | — | CUDA version for base resolution (e.g. `12.9`, `13.0`, `13.1`) |
| `--base-image` | — | Explicit base image URI (overrides `--framework` auto-resolve) |
| `--framework` | *(required)* | `vllm`, `sglang`, `trtllm`, or `dynamo` |
| `--target` | `runtime` | `runtime` or `frontend` |
| `--cuda-version` | — | Required for runtime targets (e.g. `12.9`, `13.0`, `13.1`) |
| `--context-yaml` | `container/context.yaml` | Path to context.yaml |
| `--types` | `dpkg,python` | Comma-separated list of types to extract |
| `--docker-cmd` | `docker` | Docker binary to use |
| `--verbose`, `-v` | — | Enable verbose logging to stderr |
### process_results.py flags
| Flag | Default | Description |
|------|---------|-------------|
| `--target-dir` | *(required)* | Directory containing `dpkg.tsv` and `python.tsv` from target extraction |
| `--base-dir` | — | Directory containing TSVs from base image extraction (enables `_diff.csv` output) |
| `--output`, `-o` | stdout | Output CSV path |
## Base image reference
......@@ -77,31 +133,37 @@ python container/compliance/generate_attributions.py <image:tag> \
| `trtllm` | 13.1 | `nvcr.io/nvidia/cuda-dl-base:25.12-cuda13.1-runtime-ubuntu24.04` |
| `dynamo` frontend | — | `nvcr.io/nvidia/base/ubuntu:noble-20250619` |
These values are sourced from `container/context.yaml` at runtime; the table above reflects the current defaults.
These values are sourced from `container/context.yaml`; the table above reflects the current defaults.
## How it works
The script runs two lightweight helper scripts **inside the container** via `docker run --rm -v`:
Extraction uses BuildKit's bind-mount mechanism — the target image filesystem is
mounted read-only at `/target` inside a Python 3.12 builder container, and two
helper scripts read package metadata directly from disk without starting the target
container:
- **dpkg extractor** — runs `dpkg-query` to list packages, then reads `/usr/share/doc/<pkg>/copyright` files for license info. Only DEP-5 machine-readable copyright files are parsed; ambiguous cases return `UNKNOWN`.
- **Python extractor** — uses `importlib.metadata.distributions()` to iterate installed packages. License is read from `License-Expression` (PEP 639), then `License` metadata, then trove classifiers. Ambiguous cases return `UNKNOWN`.
- **`helpers/dpkg_helper.py`** — parses `/target/var/lib/dpkg/status` for installed
packages and reads `/target/usr/share/doc/<pkg>/copyright` (DEP-5 format) for license info.
- **`helpers/python_helper.py`** — enumerates site-packages directories under `/target`
using `importlib.metadata`. License is read from `License-Expression` (PEP 639),
then `License` metadata, then trove classifiers.
Both helpers are self-contained and have no external dependencies — they run with whatever Python is in the container.
Both helpers are self-contained (stdlib only) and run inside the `python:3.12-slim`
extractor stage, not inside the target image.
## License detection
Detection is intentionally conservative: only unambiguous matches are assigned SPDX identifiers. The `UNKNOWN` entries are expected; they can be resolved with additional analysis against the raw copyright files.
Detection is intentionally conservative: only unambiguous matches are assigned SPDX
identifiers. The `UNKNOWN` entries are expected; they can be resolved with additional
analysis against the raw copyright files.
## CI integration
Attribution CSVs are generated automatically as part of CI after every successful image build. Artifacts are available in the GitHub Actions workflow run under:
Attribution CSVs are generated automatically as part of CI after every successful
image build. Artifacts are available in the GitHub Actions workflow run under:
- `compliance-{framework}-cuda{major}-{platform}` — runtime images
- `compliance-frontend-{arch}` — frontend image
The scan runs as a separate lightweight job (`prod-default-small-v2`) in parallel with tests, so it does not extend pipeline wall time.
## Requirements
- Python 3.11+
- `docker` (or compatible CLI) with access to the target registry
- `pyyaml` — only required on the host when using `--framework`/`--cuda-version` base image auto-resolution (`pip install pyyaml`)
The scan runs as a separate job in parallel with tests, so it does not extend
pipeline wall time.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Attribution extractors for container dependency scanning."""
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Extract dpkg package information from a container image."""
import logging
import subprocess
from pathlib import Path
log = logging.getLogger(__name__)
_HELPER_SCRIPT_PATH = Path(__file__).resolve().parent / "helpers" / "dpkg_helper.py"
def extract_dpkg(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """List the dpkg packages of ``image`` together with their licenses.

    The dpkg helper script is bind-mounted read-only into a throwaway
    container started from ``image`` and executed with the container's own
    ``python3``. Each line the helper prints is expected to be
    ``package<TAB>version<TAB>license``.

    Args:
        image: Container image reference to scan.
        docker_cmd: Container CLI binary to invoke.
        verbose: When true, log the command line, malformed lines and the
            final package count.

    Returns:
        One dict per package with the keys ``package_name``, ``version``,
        ``type`` (always ``"dpkg"``) and ``spdx_license``.

    Raises:
        RuntimeError: The container exited with a non-zero status other
            than 127.
        subprocess.TimeoutExpired: The container ran longer than 300 seconds.
    """
    helper_mount = f"{_HELPER_SCRIPT_PATH}:/tmp/dpkg_helper.py:ro"
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "python3",
        "-v",
        helper_mount,
        image,
        "/tmp/dpkg_helper.py",
    ]
    if verbose:
        log.info("Running: %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)
    exit_code = proc.returncode

    # Exit 127 means python3 not found — fall back to shell-based dpkg-query
    if exit_code == 127:
        log.warning(
            "python3 not found in %s, falling back to shell-based dpkg extraction (no license info)",
            image,
        )
        return _extract_dpkg_shell(image, docker_cmd, verbose)
    if exit_code != 0:
        log.error("dpkg extraction failed (exit %d): %s", exit_code, proc.stderr)
        raise RuntimeError(f"dpkg extraction failed: {proc.stderr}")

    rows = []
    for raw_line in proc.stdout.strip().splitlines():
        fields = raw_line.split("\t", 2)
        if len(fields) == 3:
            name, version, license_id = fields
            rows.append(
                {
                    "package_name": name,
                    "version": version,
                    "type": "dpkg",
                    "spdx_license": license_id,
                }
            )
        elif verbose:
            log.warning("Skipping malformed line: %r", raw_line)

    if verbose:
        log.info("Extracted %d dpkg packages", len(rows))
    return rows
def _extract_dpkg_shell(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """List dpkg packages using ``dpkg-query`` through the image's ``sh``.

    Used when the image has no ``python3``. No copyright files are read in
    this mode, so every entry is reported with license ``UNKNOWN``.

    Args:
        image: Container image reference to scan.
        docker_cmd: Container CLI binary to invoke.
        verbose: When true, log the command line and the package count.

    Returns:
        One dict per package with the keys ``package_name``, ``version``,
        ``type`` (always ``"dpkg"``) and ``spdx_license`` (always
        ``"UNKNOWN"``).

    Raises:
        RuntimeError: The container exited with a non-zero status.
        subprocess.TimeoutExpired: The container ran longer than 300 seconds.
    """
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "sh",
        image,
        "-c",
        "dpkg-query -W -f='${Package}\\t${Version}\\n'",
    ]
    if verbose:
        log.info("Running (shell fallback): %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)
    if proc.returncode != 0:
        log.error(
            "dpkg shell extraction failed (exit %d): %s",
            proc.returncode,
            proc.stderr,
        )
        raise RuntimeError(f"dpkg shell extraction failed: {proc.stderr}")

    # Lines that do not split into exactly name + version are dropped silently.
    split_lines = (entry.split("\t", 1) for entry in proc.stdout.strip().splitlines())
    rows = [
        {
            "package_name": fields[0],
            "version": fields[1],
            "type": "dpkg",
            "spdx_license": "UNKNOWN",
        }
        for fields in split_lines
        if len(fields) == 2
    ]

    if verbose:
        log.info("Extracted %d dpkg packages (shell fallback)", len(rows))
    return rows
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Extract Python package information from a container image."""
import logging
import subprocess
from pathlib import Path
log = logging.getLogger(__name__)
_HELPER_SCRIPT_PATH = Path(__file__).resolve().parent / "helpers" / "python_helper.py"
def extract_python(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """List the Python distributions of ``image`` together with their licenses.

    The Python helper script is bind-mounted read-only into a throwaway
    container started from ``image`` and executed with the container's own
    ``python3``. Each line the helper prints is expected to be
    ``package<TAB>version<TAB>license``.

    Args:
        image: Container image reference to scan.
        docker_cmd: Container CLI binary to invoke.
        verbose: When true, log the command line, malformed lines and the
            final package count.

    Returns:
        One dict per package with the keys ``package_name``, ``version``,
        ``type`` (always ``"python"``) and ``spdx_license``. The list is
        empty when the container exits with status 127 (no ``python3``).

    Raises:
        RuntimeError: The container exited with a non-zero status other
            than 127.
        subprocess.TimeoutExpired: The container ran longer than 300 seconds.
    """
    helper_mount = f"{_HELPER_SCRIPT_PATH}:/tmp/python_helper.py:ro"
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "python3",
        "-v",
        helper_mount,
        image,
        "/tmp/python_helper.py",
    ]
    if verbose:
        log.info("Running: %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)
    exit_code = proc.returncode

    # Exit 127 means python3 not found — no Python packages in this image
    if exit_code == 127:
        log.warning(
            "python3 not found in %s, skipping Python package extraction", image
        )
        return []
    if exit_code != 0:
        log.error(
            "Python extraction failed (exit %d): %s",
            exit_code,
            proc.stderr,
        )
        raise RuntimeError(f"Python extraction failed: {proc.stderr}")

    rows = []
    for raw_line in proc.stdout.strip().splitlines():
        fields = raw_line.split("\t", 2)
        if len(fields) == 3:
            name, version, license_id = fields
            rows.append(
                {
                    "package_name": name,
                    "version": version,
                    "type": "python",
                    "spdx_license": license_id,
                }
            )
        elif verbose:
            log.warning("Skipping malformed line: %r", raw_line)

    if verbose:
        log.info("Extracted %d Python packages", len(rows))
    return rows
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate attribution CSV files for container images.
Extracts dpkg and Python package information from a container image by
running helper scripts inside the container via `docker run`. Optionally
computes a diff against a base image to show only added/changed packages.
Usage:
python generate_attributions.py <image:tag> [--output out.csv] [--base-image base:tag]
python generate_attributions.py <image:tag> --framework vllm --cuda-version 12.9
"""
import argparse
import csv
import logging
import sys
from pathlib import Path
# Allow running as a script from any directory
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_ROOT = _SCRIPT_DIR.parent.parent
sys.path.insert(0, str(_SCRIPT_DIR))
from extractors.dpkg import extract_dpkg # noqa: E402
from extractors.python_pkgs import extract_python # noqa: E402
log = logging.getLogger(__name__)
VALID_TYPES = {"dpkg", "python"}
def resolve_base_image(
    framework: str,
    target: str,
    cuda_version: str,
    context_yaml_path: Path,
) -> str:
    """Resolve the base image from context.yaml for a given framework/target/cuda combo.

    For ``target == "frontend"`` the value of ``dynamo.frontend_image`` is
    returned as-is. For any other target the result is
    ``<runtime_image>:<runtime_image_tag>`` taken from
    ``<framework>.cuda<cuda_version>``.

    Args:
        framework: Top-level key in context.yaml (ignored for the frontend target).
        target: ``"frontend"`` selects the frontend image; anything else is
            treated as a runtime target.
        cuda_version: Version string appended to ``cuda`` to form the lookup key.
        context_yaml_path: Location of context.yaml.

    Returns:
        The resolved base image reference.

    Exits the process with status 1 (after logging an error) when PyYAML is
    not installed, the file is missing, the file is empty or not a mapping,
    or the required keys are absent.
    """
    try:
        import yaml
    except ImportError:
        log.error(
            "PyYAML is required for --framework/--cuda-version base image resolution. "
            "Install it with: pip install pyyaml"
        )
        sys.exit(1)

    if not context_yaml_path.is_file():
        log.error("context.yaml not found at %s", context_yaml_path)
        sys.exit(1)

    with open(context_yaml_path, "r", encoding="utf-8") as f:
        context = yaml.safe_load(f)

    # safe_load returns None for an empty document and may return a list or
    # scalar for other content; only a mapping can be queried below.
    if not isinstance(context, dict):
        log.error(
            "context.yaml at %s is empty or not a YAML mapping", context_yaml_path
        )
        sys.exit(1)

    def _section(parent: dict, key: str) -> dict:
        # A key written without a value (`dynamo:`) loads as None; treat any
        # non-mapping section as empty so the "not found" errors below fire
        # instead of an AttributeError.
        value = parent.get(key)
        return value if isinstance(value, dict) else {}

    if target == "frontend":
        frontend_image = _section(context, "dynamo").get("frontend_image")
        if not frontend_image:
            log.error("frontend_image not found in context.yaml dynamo section")
            sys.exit(1)
        return frontend_image

    # Runtime target: look up runtime_image and runtime_image_tag
    cuda_key = f"cuda{cuda_version}"
    cuda_config = _section(_section(context, framework), cuda_key)
    runtime_image = cuda_config.get("runtime_image")
    runtime_image_tag = cuda_config.get("runtime_image_tag")

    if not runtime_image or not runtime_image_tag:
        log.error(
            "Could not resolve base image for framework=%s cuda=%s target=%s. "
            "Keys runtime_image/runtime_image_tag not found under %s.%s in context.yaml",
            framework,
            cuda_version,
            target,
            framework,
            cuda_key,
        )
        sys.exit(1)

    return f"{runtime_image}:{runtime_image_tag}"
def compute_diff(
    target_packages: list[dict[str, str]],
    base_packages: list[dict[str, str]],
) -> list[dict[str, str]]:
    """Return the target packages that the base does not already provide.

    Packages are matched on ``(package_name, type)``. A target package is
    kept when the base has no such entry or lists it with a different
    version. The order of ``target_packages`` is preserved.
    """
    # If the base lists the same (name, type) more than once, the last entry wins.
    base_versions = {
        (entry["package_name"], entry["type"]): entry["version"]
        for entry in base_packages
    }

    def _is_new_or_changed(candidate: dict[str, str]) -> bool:
        known = base_versions.get((candidate["package_name"], candidate["type"]))
        return known is None or known != candidate["version"]

    return [candidate for candidate in target_packages if _is_new_or_changed(candidate)]
def write_csv(packages: list[dict[str, str]], output_path: str | None) -> None:
"""Write packages to CSV, sorted by (type, package_name)."""
sorted_packages = sorted(packages, key=lambda p: (p["type"], p["package_name"]))
fieldnames = ["package_name", "version", "type", "spdx_license"]
if output_path:
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
log.info("Wrote %d entries to %s", len(sorted_packages), output_path)
else:
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
def extract_all(
    image: str,
    types: set[str],
    docker_cmd: str,
    verbose: bool,
) -> list[dict[str, str]]:
    """Collect package rows from ``image`` for every requested extractor.

    dpkg rows (if requested) come first, followed by Python rows (if
    requested). Entries in ``types`` other than ``"dpkg"`` and ``"python"``
    are ignored here.
    """
    collected: list[dict[str, str]] = []

    if "dpkg" in types:
        log.info("Extracting dpkg packages from %s ...", image)
        collected += extract_dpkg(image, docker_cmd=docker_cmd, verbose=verbose)

    if "python" in types:
        log.info("Extracting Python packages from %s ...", image)
        collected += extract_python(image, docker_cmd=docker_cmd, verbose=verbose)

    return collected
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        Namespace with the attributes ``image``, ``output``, ``base_image``,
        ``framework``, ``target``, ``cuda_version``, ``context_yaml``,
        ``types``, ``docker_cmd`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        description="Generate attribution CSV files for container images",
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s my-registry/dynamo:vllm-runtime -o vllm.csv
%(prog)s my-registry/dynamo:vllm-runtime --framework vllm --cuda-version 12.9 -o vllm.csv
%(prog)s my-registry/dynamo:vllm-runtime --base-image nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04 -o vllm.csv
%(prog)s my-registry/dynamo:frontend --framework dynamo --target frontend -o frontend.csv
""",
    )
    # Positional: the image to scan.
    parser.add_argument(
        "image", help="Container image to scan (e.g., my-registry/dynamo:latest)"
    )
    parser.add_argument(
        "--output",
        "-o",
        help="Output CSV file path (default: stdout)",
    )
    # Base image selection: either given explicitly or resolved from
    # --framework / --target / --cuda-version.
    parser.add_argument(
        "--base-image",
        help="Base image for diff calculation (explicit, overrides --framework auto-resolve)",
    )
    parser.add_argument(
        "--framework",
        choices=["vllm", "sglang", "trtllm", "dynamo"],
        help="Framework name for auto-resolving base image from context.yaml",
    )
    parser.add_argument(
        "--target",
        default="runtime",
        choices=["runtime", "frontend"],
        help="Build target for base image resolution (default: runtime)",
    )
    parser.add_argument(
        "--cuda-version",
        choices=["12.9", "13.0", "13.1"],
        help="CUDA version for base image resolution",
    )
    parser.add_argument(
        "--context-yaml",
        # Default is derived from this script's location via _REPO_ROOT.
        default=str(_REPO_ROOT / "container" / "context.yaml"),
        help="Path to context.yaml (default: container/context.yaml in repo root)",
    )
    parser.add_argument(
        "--types",
        default="dpkg,python",
        help="Comma-separated extraction types (default: dpkg,python)",
    )
    parser.add_argument(
        "--docker-cmd",
        default="docker",
        help="Docker command to use (default: docker)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser.parse_args()
def main() -> None:
    """Command-line entry point: scan an image and write attribution CSVs.

    Steps:
      1. Parse arguments and configure logging to stderr.
      2. Validate ``--types`` against ``VALID_TYPES``.
      3. Determine the base image: ``--base-image`` if given, otherwise
         resolved from ``--framework`` (which needs ``--cuda-version`` unless
         the target is ``frontend``).
      4. Extract packages from the target image and write the full CSV.
      5. If a base image is known, extract it too and write the diff, either
         to ``<output stem>_diff<suffix>`` or to stdout.

    Exits with status 1 on an invalid type or a missing ``--cuda-version``.
    """
    args = parse_args()

    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    types = set(args.types.split(","))
    invalid = types - VALID_TYPES
    if invalid:
        log.error("Invalid types: %s (valid: %s)", invalid, VALID_TYPES)
        sys.exit(1)

    # Resolve base image if needed
    base_image = args.base_image
    if not base_image and args.framework:
        is_runtime_target = args.target != "frontend"
        if is_runtime_target and not args.cuda_version:
            log.error(
                "--cuda-version is required when using --framework for runtime targets"
            )
            sys.exit(1)
        base_image = resolve_base_image(
            framework=args.framework,
            target=args.target,
            cuda_version=args.cuda_version or "",
            context_yaml_path=Path(args.context_yaml),
        )
        log.info("Auto-resolved base image: %s", base_image)

    # The full CSV is written before the base image is touched.
    target_packages = extract_all(args.image, types, args.docker_cmd, args.verbose)
    log.info("Total packages extracted from target: %d", len(target_packages))
    write_csv(target_packages, args.output)

    # No base image means no diff to compute.
    if not base_image:
        return

    log.info("Extracting packages from base image for diff: %s", base_image)
    base_packages = extract_all(base_image, types, args.docker_cmd, args.verbose)
    log.info("Total packages extracted from base: %d", len(base_packages))

    diff_packages = compute_diff(target_packages, base_packages)
    log.info("Diff: %d new/changed packages", len(diff_packages))

    if not args.output:
        # Separator goes to stderr; the diff CSV follows the full CSV on stdout.
        print("\n# --- DIFF (new/changed packages vs base) ---", file=sys.stderr)
        write_csv(diff_packages, None)
        return

    # Insert _diff before the file extension
    full_csv = Path(args.output)
    write_csv(diff_packages, str(full_csv.with_stem(full_csv.stem + "_diff")))


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script runs INSIDE the container. It must be fully self-contained
# with zero external dependencies (only Python stdlib).
# This script runs INSIDE the container (local mode) or against a mounted
# filesystem root (--root /target mode for BuildKit extraction).
# It must be fully self-contained with zero external dependencies (only Python stdlib).
import argparse
import os
import subprocess
import sys
......@@ -76,9 +78,10 @@ def extract_dep5_license(content):
return "UNKNOWN"
def get_license_for_package(pkg_name):
"""Read /usr/share/doc/<pkg>/copyright and extract license info."""
copyright_path = f"/usr/share/doc/{pkg_name}/copyright"
def get_license_for_package(pkg_name, root="/"):
"""Read <root>/usr/share/doc/<pkg>/copyright and extract license info."""
root = root.rstrip("/")
copyright_path = f"{root}/usr/share/doc/{pkg_name}/copyright"
if not os.path.isfile(copyright_path):
return "UNKNOWN"
try:
......@@ -96,7 +99,59 @@ def get_license_for_package(pkg_name):
return "UNKNOWN"
def parse_dpkg_status(status_path):
    """Parse a dpkg status file and return {pkg: version} for installed packages.

    The file is read as a sequence of stanzas separated by blank lines. Only
    the first line of each field is used; continuation lines (starting with a
    space or tab) are ignored.

    A stanza is recorded only when the last word of its ``Status`` field is
    exactly ``installed``. A substring test is not enough, because the states
    ``not-installed`` and ``half-installed`` also contain that text.

    Args:
        status_path: Path to the status file (e.g. <root>/var/lib/dpkg/status).

    Returns:
        Dict mapping package name to version; ``"UNKNOWN"`` is used when a
        stanza has no ``Version`` field. If a package name occurs in several
        installed stanzas, the last one wins.

    Exits the process with status 1 if the file cannot be read.
    """
    packages = {}

    def _record(stanza):
        # Status has the form "<want> <flag> <state>"; only the state matters.
        name = stanza.get("Package")
        status_words = stanza.get("Status", "").split()
        if name and status_words and status_words[-1] == "installed":
            packages[name] = stanza.get("Version", "UNKNOWN")

    current = {}
    try:
        # Decode as UTF-8 regardless of the locale; undecodable bytes are
        # replaced rather than aborting the scan.
        with open(status_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.rstrip("\n")
                if not line:
                    # End of stanza — record if installed
                    _record(current)
                    current = {}
                elif line.startswith((" ", "\t")):
                    # Continuation line — ignore
                    continue
                elif ":" in line:
                    key, _, val = line.partition(":")
                    current[key.strip()] = val.strip()
    except OSError:
        print(f"ERROR: Cannot read dpkg status file: {status_path}", file=sys.stderr)
        sys.exit(1)

    # Handle last stanza if file has no trailing blank line
    _record(current)
    return packages
def main():
parser = argparse.ArgumentParser(
description="Extract dpkg package info (stdlib only)"
)
parser.add_argument(
"--root",
default="/",
help="Filesystem root to inspect (default: /, i.e. running system)",
)
args = parser.parse_args()
root = args.root.rstrip("/") or "/"
count = 0
if root != "/":
# BuildKit mode: parse dpkg status file from mounted target filesystem
status_path = f"{root}/var/lib/dpkg/status"
pkgs = parse_dpkg_status(status_path)
for pkg, version in pkgs.items():
license_id = get_license_for_package(pkg, root)
print(f"{pkg}\t{version}\t{license_id}")
count += 1
else:
# Local mode: run dpkg-query inside the container
result = subprocess.run(
["dpkg-query", "-W", "-f=${Package}\t${Version}\n"],
capture_output=True,
......@@ -113,6 +168,10 @@ def main():
pkg, version = parts
license_id = get_license_for_package(pkg)
print(f"{pkg}\t{version}\t{license_id}")
count += 1
icon = "✅" if count > 0 else "⚠️"
print(f"{icon} [dpkg] extracted {count} package(s)", file=sys.stderr)
if __name__ == "__main__":
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script runs INSIDE the container. It must be fully self-contained
# with zero external dependencies (only Python stdlib).
# This script runs INSIDE the container (local mode) or against a mounted
# filesystem root (--root /target mode for BuildKit extraction).
# It must be fully self-contained with zero external dependencies (only Python stdlib).
import argparse
import glob
import importlib.metadata
import sys
# Conservative classifier -> SPDX mapping
_CLASSIFIER_MAP = {
......@@ -96,8 +100,46 @@ def get_license(dist):
def main():
parser = argparse.ArgumentParser(
description="Extract Python package info (stdlib only)"
)
parser.add_argument(
"--root",
default="/",
help="Filesystem root to inspect (default: /, i.e. running system)",
)
args = parser.parse_args()
root = args.root.rstrip("/") or "/"
if root != "/":
# BuildKit mode: scan site-packages directories in the mounted target filesystem
_patterns = [
"/usr/lib/python*/dist-packages",
"/usr/lib/python*/site-packages",
"/usr/local/lib/python*/dist-packages",
"/usr/local/lib/python*/site-packages",
# conda / virtualenv layouts common in ML containers (e.g. /opt/conda)
"/opt/*/lib/python*/site-packages",
"/opt/*/lib/python*/dist-packages",
# virtualenv one level deeper (e.g. /opt/dynamo/venv/lib/python*/site-packages)
"/opt/*/*/lib/python*/site-packages",
"/opt/*/*/lib/python*/dist-packages",
]
search_paths = []
print(f"[python] search paths ({len(_patterns)} patterns):", file=sys.stderr)
for pattern in _patterns:
matches = glob.glob(f"{root}{pattern}")
marker = "+" if matches else "-"
label = f"({len(matches)} match)" if matches else "(no match)"
print(f"[python] {marker} {root}{pattern} {label}", file=sys.stderr)
search_paths.extend(matches)
dists = importlib.metadata.distributions(path=search_paths)
else:
# Local mode: enumerate distributions in the running Python environment
dists = importlib.metadata.distributions()
seen = set()
for dist in importlib.metadata.distributions():
for dist in dists:
name = dist.metadata["Name"]
if not name:
continue
......@@ -111,6 +153,10 @@ def main():
spdx = get_license(dist)
print(f"{name}\t{version}\t{spdx}")
count = len(seen)
icon = "✅" if count > 0 else "⚠️"
print(f"{icon} [python] extracted {count} package(s)", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Convert BuildKit TSV extraction output to attribution CSV files.
Reads dpkg.tsv and python.tsv from a target extraction directory, writes a
sorted CSV, and optionally computes a diff against a base extraction directory.
Usage:
python process_results.py --target-dir <dir> --output attribution.csv
python process_results.py --target-dir <dir> --base-dir <dir> --output attribution.csv
# Produces: attribution.csv and attribution_diff.csv
"""
import argparse
import csv
import sys
from pathlib import Path
def read_tsv(tsv_path: Path, pkg_type: str) -> list[dict[str, str]]:
    """Parse a tab-separated extraction output file into package dicts.

    Each record is expected to look like ``name<TAB>version<TAB>license``.
    Lines that do not split into exactly three fields (blank lines included)
    are skipped.

    Args:
        tsv_path: Path of the TSV file. A path that is not a regular file
            yields an empty list.
        pkg_type: Value stored under the ``type`` key of every returned dict.

    Returns:
        One dict per valid record, in file order, with the keys
        ``package_name``, ``version``, ``type`` and ``spdx_license``.
    """
    packages: list[dict[str, str]] = []
    if not tsv_path.is_file():
        return packages
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine running this script; undecodable bytes are replaced.
    text = tsv_path.read_text(encoding="utf-8", errors="replace")
    # Do not strip() the whole text before splitting: that would remove the
    # trailing tab of a final record whose license field is empty, leaving it
    # with two fields and silently dropping it. Blank lines are rejected by
    # the field-count check below anyway.
    for line in text.splitlines():
        parts = line.split("\t", 2)
        if len(parts) != 3:
            continue
        pkg_name, version, spdx_license = parts
        packages.append(
            {
                "package_name": pkg_name,
                "version": version,
                "type": pkg_type,
                "spdx_license": spdx_license,
            }
        )
    return packages
def read_extraction_dir(directory: Path) -> list[dict[str, str]]:
    """Collect the package records stored in one extraction directory.

    ``dpkg.tsv`` is read first (records tagged ``dpkg``), followed by
    ``python.tsv`` (records tagged ``python``); the results are concatenated
    in that order.
    """
    sources = (("dpkg.tsv", "dpkg"), ("python.tsv", "python"))
    collected: list[dict[str, str]] = []
    for file_name, pkg_type in sources:
        collected.extend(read_tsv(directory / file_name, pkg_type))
    return collected
def compute_diff(
    target_packages: list[dict[str, str]],
    base_packages: list[dict[str, str]],
) -> list[dict[str, str]]:
    """Select the target packages that are absent from, or versioned differently in, base.

    Packages are identified by the pair (``package_name``, ``type``). A target
    entry is kept when base has no entry with that identity or records a
    different ``version`` for it. Target order is preserved.
    """
    # Index base versions by identity; a later duplicate overwrites an earlier one.
    known_versions: dict[tuple[str, str], str] = {}
    for base_pkg in base_packages:
        identity = (base_pkg["package_name"], base_pkg["type"])
        known_versions[identity] = base_pkg["version"]

    changed: list[dict[str, str]] = []
    for candidate in target_packages:
        identity = (candidate["package_name"], candidate["type"])
        if known_versions.get(identity) == candidate["version"]:
            continue  # same version already present in base
        changed.append(candidate)
    return changed
def write_csv(packages: list[dict[str, str]], output_path: Path | None) -> None:
"""Write packages to CSV sorted by (type, package_name)."""
sorted_packages = sorted(packages, key=lambda p: (p["type"], p["package_name"]))
fieldnames = ["package_name", "version", "type", "spdx_license"]
if output_path:
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
print(f"Wrote {len(sorted_packages)} entries to {output_path}", file=sys.stderr)
else:
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
def main() -> None:
    """Command-line entry point: turn extraction TSVs into attribution CSVs.

    Reads the packages found under ``--target-dir`` and writes them as CSV to
    ``--output`` (or stdout). When ``--base-dir`` is given, the packages that
    are new or changed relative to that directory are written as a second
    CSV: next to ``--output`` with a ``_diff`` stem suffix, or to stdout.
    Exits with status 1 when a directory is missing or the target directory
    yields no packages.
    """
    cli = argparse.ArgumentParser(
        description="Convert BuildKit TSV extraction output to attribution CSV",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s --target-dir ./output --output attribution.csv
%(prog)s --target-dir ./output --base-dir ./base-output --output attribution.csv
""",
    )
    cli.add_argument(
        "--target-dir",
        required=True,
        help="Directory containing dpkg.tsv and python.tsv from the target image extraction",
    )
    cli.add_argument(
        "--base-dir",
        help="Directory containing dpkg.tsv and python.tsv from the base image extraction (enables diff output)",
    )
    cli.add_argument(
        "--output",
        "-o",
        help="Output CSV file path (default: stdout)",
    )
    options = cli.parse_args()

    # --- full attribution list for the target image ---
    target_root = Path(options.target_dir)
    if not target_root.is_dir():
        print(f"ERROR: --target-dir does not exist: {target_root}", file=sys.stderr)
        sys.exit(1)

    target_packages = read_extraction_dir(target_root)
    if not target_packages:
        print(f"ERROR: no packages found in {target_root}", file=sys.stderr)
        sys.exit(1)

    destination = None
    if options.output:
        destination = Path(options.output)
    write_csv(target_packages, destination)

    # --- optional diff against the base image ---
    if not options.base_dir:
        return

    base_root = Path(options.base_dir)
    if not base_root.is_dir():
        print(f"ERROR: --base-dir does not exist: {base_root}", file=sys.stderr)
        sys.exit(1)

    diff_packages = compute_diff(target_packages, read_extraction_dir(base_root))
    print(
        f"Diff: {len(diff_packages)} new/changed packages vs base", file=sys.stderr
    )

    if destination:
        # e.g. attribution.csv -> attribution_diff.csv
        diff_destination = destination.with_stem(destination.stem + "_diff")
    else:
        # Both CSVs share stdout; the separator goes to stderr.
        print("\n# --- DIFF (new/changed packages vs base) ---", file=sys.stderr)
        diff_destination = None
    write_csv(diff_packages, diff_destination)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Resolve the base image for a given framework/target/cuda from context.yaml.
Prints the resolved image URI to stdout so it can be captured in shell scripts.
Usage:
python resolve_base_image.py --framework vllm --cuda-version 12.9
python resolve_base_image.py --framework dynamo --target frontend
python resolve_base_image.py --framework sglang --cuda-version 13.0
"""
import argparse
import sys
from pathlib import Path
# Third parent of this script's resolved path; used to locate the default
# container/context.yaml.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent


def _as_mapping(value):
    """Return ``value`` if it is a dict, otherwise an empty dict.

    Lets lookups on an empty document or on a null/non-mapping section fall
    through to the regular "not found" error paths instead of raising
    AttributeError.
    """
    return value if isinstance(value, dict) else {}


def main() -> None:
    """Resolve a base image URI from context.yaml and print it to stdout.

    For ``--target frontend`` the value of ``dynamo.frontend_image`` is
    printed. For the default runtime target, ``runtime_image`` and
    ``runtime_image_tag`` are read from ``<framework>.cuda<cuda-version>`` and
    printed as ``image:tag``.

    Exits with status 1 (message on stderr) when pyyaml is not importable,
    the context file is missing, ``--cuda-version`` is absent for a runtime
    target, or the requested keys cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Resolve base image URI from container/context.yaml"
    )
    parser.add_argument(
        "--framework",
        required=True,
        choices=["vllm", "sglang", "trtllm", "dynamo"],
        help="Framework name",
    )
    parser.add_argument(
        "--target",
        default="runtime",
        choices=["runtime", "frontend"],
        help="Build target (default: runtime)",
    )
    parser.add_argument(
        "--cuda-version",
        help="CUDA version (e.g. 12.9, 13.0, 13.1) — required for runtime targets",
    )
    parser.add_argument(
        "--context-yaml",
        default=str(_REPO_ROOT / "container" / "context.yaml"),
        help="Path to context.yaml (default: container/context.yaml in repo root)",
    )
    args = parser.parse_args()

    # Imported after argument parsing so --help and usage errors work even
    # when pyyaml is not installed.
    try:
        import yaml
    except ImportError:
        print(
            "ERROR: pyyaml is required — install with: pip install pyyaml",
            file=sys.stderr,
        )
        sys.exit(1)

    context_yaml = Path(args.context_yaml)
    if not context_yaml.is_file():
        print(f"ERROR: context.yaml not found at {context_yaml}", file=sys.stderr)
        sys.exit(1)

    with open(context_yaml, encoding="utf-8") as f:
        # safe_load returns None for an empty document; normalise to a dict.
        ctx = _as_mapping(yaml.safe_load(f))

    if args.target == "frontend":
        image = _as_mapping(ctx.get("dynamo")).get("frontend_image")
        if not image:
            print(
                "ERROR: frontend_image not found in context.yaml dynamo section",
                file=sys.stderr,
            )
            sys.exit(1)
        print(image)
        return

    # Runtime target
    if not args.cuda_version:
        print("ERROR: --cuda-version is required for runtime targets", file=sys.stderr)
        sys.exit(1)

    # A section that is present but null (or not a mapping) is treated the
    # same as a missing one.
    fw_config = _as_mapping(ctx.get(args.framework))
    cuda_key = f"cuda{args.cuda_version}"
    cuda_config = _as_mapping(fw_config.get(cuda_key))
    runtime_image = cuda_config.get("runtime_image")
    runtime_image_tag = cuda_config.get("runtime_image_tag")
    if not runtime_image or not runtime_image_tag:
        print(
            f"ERROR: Could not resolve base image for framework={args.framework} "
            f"cuda={args.cuda_version}. Keys runtime_image/runtime_image_tag not found "
            f"under {args.framework}.{cuda_key} in context.yaml",
            file=sys.stderr,
        )
        sys.exit(1)
    print(f"{runtime_image}:{runtime_image_tag}")


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment