Unverified Commit f842a8e4 authored by Harrison Saturley-Hall's avatar Harrison Saturley-Hall Committed by GitHub
Browse files

ci: add a preliminary compliance scan to ci (#7289)


Signed-off-by: default avatarHarrison King Saturley-Hall <hsaturleyhal@nvidia.com>
parent 87ac404e
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/*.onnx **/*.onnx
**/*.plan **/*.plan
...@@ -45,6 +33,7 @@ container/Dockerfile* ...@@ -45,6 +33,7 @@ container/Dockerfile*
container/**/*.Dockerfile container/**/*.Dockerfile
container/render.py container/render.py
container/context.yaml container/context.yaml
container/compliance/
.venv .venv
.venv-docs .venv-docs
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: 'Compliance Scan'
description: 'Generate attribution CSVs (dpkg + Python) for a container image and upload as workflow artifacts'

inputs:
  image:
    description: 'Full container image URI to scan (must be pullable)'
    required: true
  artifact_name:
    description: 'Name for the uploaded artifact (e.g., compliance-vllm-cuda12-amd64)'
    required: true
  framework:
    description: 'Framework name for base image resolution (vllm, sglang, trtllm, dynamo)'
    required: false
    default: ''
  target:
    description: 'Build target for base image resolution (runtime or frontend)'
    required: false
    default: 'runtime'
  cuda_version:
    description: 'CUDA version for base image resolution (e.g., 12.9, 13.0, 13.1)'
    required: false
    default: ''
  base_image:
    description: 'Explicit base image for diff (overrides framework/cuda-version auto-resolve)'
    required: false
    default: ''
  retention_days:
    description: 'Artifact retention in days'
    required: false
    default: '90'

runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 #v3.11.1
      with:
        driver: docker-container
        # Enable BuildKit for enhanced metadata
        buildkitd-flags: --debug
        version: v0.14.1

    # Free disk space on the runner before pulling the image to scan.
    - name: Cleanup
      if: always()
      shell: bash
      run: |
        docker system prune -af

    # pyyaml is needed on the host by generate_attributions.py when it
    # resolves the base image from container/context.yaml.
    - name: Set up Python
      uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
      with:
        python-version: '3.12'
        pip-install: pyyaml

    # Inputs are handed to the shell through env vars rather than being
    # expanded into the script text, so their content is never parsed as
    # shell syntax.
    - name: Pull container image
      shell: bash
      env:
        SCAN_IMAGE: ${{ inputs.image }}
      run: |
        source ./.github/scripts/retry_docker.sh
        retry_pull "${SCAN_IMAGE}"

    - name: Generate attribution CSVs
      shell: bash
      env:
        SCAN_IMAGE: ${{ inputs.image }}
        SCAN_ARTIFACT_NAME: ${{ inputs.artifact_name }}
        SCAN_FRAMEWORK: ${{ inputs.framework }}
        SCAN_TARGET: ${{ inputs.target }}
        SCAN_CUDA_VERSION: ${{ inputs.cuda_version }}
        SCAN_BASE_IMAGE: ${{ inputs.base_image }}
      run: |
        # Build optional flags as an array so each value stays one argument.
        # An explicit base image takes precedence over framework resolution.
        args=()
        if [ -n "${SCAN_BASE_IMAGE}" ]; then
          args+=(--base-image "${SCAN_BASE_IMAGE}")
        elif [ -n "${SCAN_FRAMEWORK}" ]; then
          args+=(--framework "${SCAN_FRAMEWORK}")
          args+=(--target "${SCAN_TARGET}")
          if [ -n "${SCAN_CUDA_VERSION}" ]; then
            args+=(--cuda-version "${SCAN_CUDA_VERSION}")
          fi
        fi
        python container/compliance/generate_attributions.py \
          "${SCAN_IMAGE}" \
          --output "${SCAN_ARTIFACT_NAME}.csv" \
          --verbose \
          "${args[@]}"

    # The glob also picks up the optional "<artifact_name>_diff.csv" file.
    - name: Upload attribution artifacts
      if: always()
      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
      with:
        name: ${{ inputs.artifact_name }}
        path: ${{ inputs.artifact_name }}*.csv
        retention-days: ${{ inputs.retention_days }}
...@@ -83,6 +83,7 @@ core: ...@@ -83,6 +83,7 @@ core:
- 'container/templates/wheel_builder.Dockerfile' - 'container/templates/wheel_builder.Dockerfile'
- '.dockerignore' - '.dockerignore'
- 'container/deps/*' - 'container/deps/*'
- 'container/compliance/**'
- '.cargo/config.toml' - '.cargo/config.toml'
- 'lib/**' - 'lib/**'
- 'tests/**' - 'tests/**'
...@@ -154,6 +155,7 @@ frontend: ...@@ -154,6 +155,7 @@ frontend:
- '*.toml' - '*.toml'
- '*.lock' - '*.lock'
- 'container/deps/*' - 'container/deps/*'
- 'container/compliance/**'
- 'components/src/dynamo/router/**' - 'components/src/dynamo/router/**'
- 'components/src/dynamo/mocker/**' - 'components/src/dynamo/mocker/**'
- 'components/src/dynamo/frontend/**' - 'components/src/dynamo/frontend/**'
......
...@@ -220,6 +220,44 @@ jobs: ...@@ -220,6 +220,44 @@ jobs:
echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY
echo "| \`${{ steps.calculate-target-tag.outputs.azure_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY echo "| \`${{ steps.calculate-target-tag.outputs.azure_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY
# ============================================================================
# COMPLIANCE — Generate attribution CSVs for dpkg and Python packages
# ============================================================================
compliance:
  needs: [build-frontend-image, changed-files]
  if: needs.build-frontend-image.result == 'success'
  strategy:
    fail-fast: false
    matrix:
      arch: [amd64, arm64]
  name: Compliance frontend-${{ matrix.arch }}
  runs-on: ${{ matrix.arch == 'amd64' && 'prod-builder-amd-v1' || 'prod-tester-arm-v1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    # Values reach the script through env vars instead of being expanded
    # into the script text; the resulting URI is identical.
    - name: Calculate image URI
      id: images
      shell: bash
      env:
        ECR_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
        ECR_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
        COMMIT_SHA: ${{ github.sha }}
        ARCH: ${{ matrix.arch }}
      run: |
        TARGET_TAG="${COMMIT_SHA}-frontend-${ARCH}"
        FRONTEND_IMAGE="${ECR_ACCOUNT_ID}.dkr.ecr.${ECR_REGION}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG}"
        echo "frontend_image=${FRONTEND_IMAGE}" >> "$GITHUB_OUTPUT"
    - name: Compliance scan
      uses: ./.github/actions/compliance-scan
      with:
        image: ${{ steps.images.outputs.frontend_image }}
        artifact_name: compliance-frontend-${{ matrix.arch }}
        framework: dynamo
        target: frontend
frontend-status-check: frontend-status-check:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [changed-files, build-frontend-image, build-epp-image] needs: [changed-files, build-frontend-image, build-epp-image]
......
...@@ -504,6 +504,43 @@ jobs: ...@@ -504,6 +504,43 @@ jobs:
dind_as_sidecar: 'true' dind_as_sidecar: 'true'
# ============================================================================
# COMPLIANCE — Generate attribution CSVs for dpkg and Python packages
# ============================================================================
compliance:
  if: inputs.build_image && inputs.push_image
  needs: [build]
  name: Compliance ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  runs-on: ${{ inputs.platform == 'amd64' && 'prod-builder-amd-v1' || 'prod-tester-arm-v1' }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    # Workflow inputs and secrets are passed through env vars and every
    # expansion is quoted, so their content is never parsed as shell syntax.
    - name: Calculate image URI
      id: images
      shell: bash
      env:
        CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
        ECR_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
        ECR_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
        TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
        PLATFORM: ${{ inputs.platform }}
      run: |
        # Keep only the part before the first dot, e.g. "12.9" -> "12".
        CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
        echo "cuda_major=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
        RUNTIME_IMAGE="${ECR_ACCOUNT_ID}.dkr.ecr.${ECR_REGION}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${PLATFORM}"
        echo "runtime_image=${RUNTIME_IMAGE}" >> "$GITHUB_OUTPUT"
    # The artifact name uses the CUDA major version; base-image resolution
    # receives the full version string.
    - name: Compliance scan
      uses: ./.github/actions/compliance-scan
      with:
        image: ${{ steps.images.outputs.runtime_image }}
        artifact_name: compliance-${{ inputs.framework }}-cuda${{ steps.images.outputs.cuda_major }}-${{ inputs.platform }}
        framework: ${{ inputs.framework }}
        cuda_version: ${{ inputs.cuda_version }}
# ============================================================================ # ============================================================================
# COPY TO ACR # COPY TO ACR
# ============================================================================ # ============================================================================
......
# Container Compliance Tooling
Scripts for generating attribution CSVs from built container images, listing all installed dpkg and Python packages with their SPDX license identifiers where known.
## Output format
Each run produces up to two CSV files, each with the following columns:
| Column | Description |
|--------|-------------|
| `package_name` | Package name as reported by dpkg or pip |
| `version` | Installed version |
| `type` | `dpkg` or `python` |
| `spdx_license` | SPDX identifier (e.g. `MIT`, `Apache-2.0`) or `UNKNOWN` |
Files are sorted by `(type, package_name)` for stable diffs.
When a base image is provided, a second `_diff.csv` file is written containing only packages that are new or version-changed relative to the base — i.e. what Dynamo's build layers added on top of the upstream image.
## Usage
```bash
# Full scan, output to stdout
python container/compliance/generate_attributions.py <image:tag>
# Write to file
python container/compliance/generate_attributions.py <image:tag> -o attribution.csv
# With base image diff — auto-resolved from context.yaml
python container/compliance/generate_attributions.py <image:tag> \
--framework vllm \
--cuda-version 12.9 \
-o attribution-vllm-cuda12-amd64.csv
# Produces: attribution-vllm-cuda12-amd64.csv (full)
# attribution-vllm-cuda12-amd64_diff.csv (delta from base)
# With explicit base image override
python container/compliance/generate_attributions.py <image:tag> \
--base-image nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04 \
-o attribution.csv
# Frontend image
python container/compliance/generate_attributions.py <image:tag> \
--framework dynamo \
--target frontend \
-o attribution-frontend-amd64.csv
# dpkg only
python container/compliance/generate_attributions.py <image:tag> \
--types dpkg \
-o attribution-dpkg.csv
```
### All flags
| Flag | Default | Description |
|------|---------|-------------|
| `image` | *(required)* | Container image to scan |
| `--output`, `-o` | stdout | Output CSV path |
| `--framework` | — | Auto-resolve base image from `context.yaml` (`vllm`, `sglang`, `trtllm`, `dynamo`) |
| `--target` | `runtime` | Build target for base resolution (`runtime` or `frontend`) |
| `--cuda-version` | — | CUDA version for base resolution (e.g. `12.9`, `13.0`, `13.1`) |
| `--base-image` | — | Explicit base image URI (overrides `--framework` auto-resolve) |
| `--context-yaml` | `container/context.yaml` | Path to context.yaml |
| `--types` | `dpkg,python` | Comma-separated list of types to extract |
| `--docker-cmd` | `docker` | Docker binary to use |
| `--verbose`, `-v` | — | Enable verbose logging to stderr |
## Base image reference
| Framework | CUDA | Base image |
|-----------|------|------------|
| `vllm` | 12.9 | `nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04` |
| `vllm` | 13.0 | `nvcr.io/nvidia/cuda:13.0.2-runtime-ubuntu24.04` |
| `sglang` | 12.9 | `lmsysorg/sglang:v0.5.9-runtime` |
| `sglang` | 13.0 | `lmsysorg/sglang:v0.5.9-cu130-runtime` |
| `trtllm` | 13.1 | `nvcr.io/nvidia/cuda-dl-base:25.12-cuda13.1-runtime-ubuntu24.04` |
| `dynamo` frontend | — | `nvcr.io/nvidia/base/ubuntu:noble-20250619` |
These values are sourced from `container/context.yaml` at runtime; the table above reflects the current defaults.
## How it works
The script runs two lightweight helper scripts **inside the container** via `docker run --rm -v`:
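- **dpkg extractor** — runs `dpkg-query` to list packages, then reads `/usr/share/doc/<pkg>/copyright` files for license info. Only DEP-5 machine-readable copyright files are parsed, and only `License:` short names found in the helper's built-in mapping table are recognized. If several distinct licenses are recognized, they are joined with ` AND `; all other cases return `UNKNOWN`.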
- **Python extractor** — uses `importlib.metadata.distributions()` to iterate installed packages. License is read from `License-Expression` (PEP 639), then `License` metadata, then trove classifiers. Ambiguous cases return `UNKNOWN`.
Both helpers are self-contained and have no external dependencies — they run with whatever Python is in the container.
## License detection
Detection is intentionally conservative: only unambiguous matches are assigned SPDX identifiers. The `UNKNOWN` entries are expected; they can be resolved with additional analysis against the raw copyright files.
## CI integration
Attribution CSVs are generated automatically as part of CI after every successful image build. Artifacts are available in the GitHub Actions workflow run under:
- `compliance-{framework}-cuda{major}-{platform}` — runtime images
- `compliance-frontend-{arch}` — frontend image
The scan runs as a separate job (on `prod-builder-amd-v1` for amd64 and `prod-tester-arm-v1` for arm64) in parallel with tests, so it does not extend pipeline wall time.
## Requirements
- Python 3.11+
- `docker` (or compatible CLI) with access to the target registry
- `pyyaml` — only required on the host when using `--framework`/`--cuda-version` base image auto-resolution (`pip install pyyaml`)
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Attribution extractors for container dependency scanning."""
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Extract dpkg package information from a container image."""
import logging
import subprocess
from pathlib import Path
log = logging.getLogger(__name__)
_HELPER_SCRIPT_PATH = Path(__file__).resolve().parent / "helpers" / "dpkg_helper.py"
def extract_dpkg(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """Collect dpkg package attributions by running the helper inside ``image``.

    The helper script is bind-mounted read-only at ``/tmp/dpkg_helper.py``
    and executed with the image's ``python3`` as entrypoint. Its stdout is
    expected to hold one ``name<TAB>version<TAB>license`` record per line.

    Args:
        image: Container image reference to scan.
        docker_cmd: Container CLI binary to invoke.
        verbose: When true, log the command, skipped lines and the count.

    Returns:
        A list of dicts with keys ``package_name``, ``version``, ``type``
        (always ``"dpkg"``) and ``spdx_license``.

    Raises:
        RuntimeError: The container exited non-zero with a status other
            than 127.
    """
    mount_spec = f"{_HELPER_SCRIPT_PATH}:/tmp/dpkg_helper.py:ro"
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "python3",
        "-v",
        mount_spec,
        image,
        "/tmp/dpkg_helper.py",
    ]
    if verbose:
        log.info("Running: %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)

    # Exit status 127 is treated as "python3 is missing from the image";
    # the shell-based extractor is used instead (without license data).
    if proc.returncode == 127:
        log.warning(
            "python3 not found in %s, falling back to shell-based dpkg extraction (no license info)",
            image,
        )
        return _extract_dpkg_shell(image, docker_cmd, verbose)

    if proc.returncode != 0:
        log.error("dpkg extraction failed (exit %d): %s", proc.returncode, proc.stderr)
        raise RuntimeError(f"dpkg extraction failed: {proc.stderr}")

    packages = []
    for raw_line in proc.stdout.strip().splitlines():
        fields = raw_line.split("\t", 2)
        if len(fields) == 3:
            name, version, spdx = fields
            packages.append(
                {
                    "package_name": name,
                    "version": version,
                    "type": "dpkg",
                    "spdx_license": spdx,
                }
            )
        elif verbose:
            log.warning("Skipping malformed line: %r", raw_line)

    if verbose:
        log.info("Extracted %d dpkg packages", len(packages))
    return packages
def _extract_dpkg_shell(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """List dpkg packages through ``sh`` for images that lack python3.

    Runs ``dpkg-query`` directly in the container, so no copyright files
    are inspected: every returned entry carries ``spdx_license`` set to
    ``"UNKNOWN"``. Lines without a tab separator are dropped silently.

    Raises:
        RuntimeError: The container exited with a non-zero status.
    """
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "sh",
        image,
        "-c",
        "dpkg-query -W -f='${Package}\\t${Version}\\n'",
    ]
    if verbose:
        log.info("Running (shell fallback): %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)
    if proc.returncode != 0:
        log.error(
            "dpkg shell extraction failed (exit %d): %s",
            proc.returncode,
            proc.stderr,
        )
        raise RuntimeError(f"dpkg shell extraction failed: {proc.stderr}")

    records = (row.split("\t", 1) for row in proc.stdout.strip().splitlines())
    packages = [
        {
            "package_name": fields[0],
            "version": fields[1],
            "type": "dpkg",
            "spdx_license": "UNKNOWN",
        }
        for fields in records
        if len(fields) == 2
    ]

    if verbose:
        log.info("Extracted %d dpkg packages (shell fallback)", len(packages))
    return packages
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script runs INSIDE the container. It must be fully self-contained
# with zero external dependencies (only Python stdlib).
import os
import subprocess
import sys
# Conservative DEP-5 license field -> SPDX mapping.
# Keys are short license names as written after "License:" in a
# machine-readable copyright file; values are the SPDX identifiers reported
# in the CSV. Names missing from this table are ignored by
# extract_dep5_license().
_DEP5_MAP = {
"Apache-2.0": "Apache-2.0",
"Apache-2": "Apache-2.0",
"Artistic-2.0": "Artistic-2.0",
"BSD-2-clause": "BSD-2-Clause",
"BSD-3-clause": "BSD-3-Clause",
"BSL-1.0": "BSL-1.0",
"CC0-1.0": "CC0-1.0",
"Expat": "MIT",
# A trailing "+" maps to the "-or-later" SPDX form; no "+" maps to "-only".
"GPL-2": "GPL-2.0-only",
"GPL-2+": "GPL-2.0-or-later",
"GPL-2.0": "GPL-2.0-only",
"GPL-2.0+": "GPL-2.0-or-later",
"GPL-3": "GPL-3.0-only",
"GPL-3+": "GPL-3.0-or-later",
"GPL-3.0": "GPL-3.0-only",
"GPL-3.0+": "GPL-3.0-or-later",
"ISC": "ISC",
"LGPL-2": "LGPL-2.0-only",
"LGPL-2+": "LGPL-2.0-or-later",
"LGPL-2.0": "LGPL-2.0-only",
"LGPL-2.0+": "LGPL-2.0-or-later",
"LGPL-2.1": "LGPL-2.1-only",
"LGPL-2.1+": "LGPL-2.1-or-later",
"LGPL-3": "LGPL-3.0-only",
"LGPL-3+": "LGPL-3.0-or-later",
"LGPL-3.0": "LGPL-3.0-only",
"LGPL-3.0+": "LGPL-3.0-or-later",
"MIT": "MIT",
"MPL-2.0": "MPL-2.0",
"PSF-2": "PSF-2.0",
# NOTE(review): "public-domain" is reported as CC0-1.0 — confirm this
# equivalence is acceptable for compliance reporting.
"public-domain": "CC0-1.0",
"Zlib": "Zlib",
"OpenSSL": "OpenSSL",
"WTFPL": "WTFPL",
}
# Lower-cased copy of the table, used as a case-insensitive fallback lookup.
_DEP5_MAP_LOWER = {k.lower(): v for k, v in _DEP5_MAP.items()}
def is_dep5(content):
    """Tell whether ``content`` looks like a machine-readable copyright file.

    Blank lines and lines starting with ``#`` are skipped; the decision is
    made solely on whether the first remaining line begins with ``Format:``.
    Returns ``False`` when no such line exists.
    """
    stripped_lines = (raw.strip() for raw in content.splitlines())
    first_meaningful = next(
        (text for text in stripped_lines if text and not text.startswith("#")),
        None,
    )
    if first_meaningful is None:
        return False
    return first_meaningful.startswith("Format:")
def extract_dep5_license(content):
    """Summarise the recognised licenses named in a DEP-5 copyright file.

    Every line that, once stripped, starts with ``License:`` is inspected.
    Its value is looked up in ``_DEP5_MAP`` (exact match first, then
    case-insensitively); values that do not map are ignored.

    Returns the single SPDX identifier when exactly one distinct license
    was recognised, the sorted identifiers joined with ``" AND "`` when
    there are several, and ``"UNKNOWN"`` when there are none.
    """
    field = "License:"
    recognised = set()
    for raw in content.splitlines():
        text = raw.strip()
        if not text.startswith(field):
            continue
        short_name = text[len(field) :].strip()
        spdx = _DEP5_MAP.get(short_name) or _DEP5_MAP_LOWER.get(short_name.lower())
        if spdx:
            recognised.add(spdx)

    if not recognised:
        return "UNKNOWN"
    # Joining a single-element set yields that element unchanged.
    return " AND ".join(sorted(recognised))
def get_license_for_package(pkg_name):
    """Look up the license of ``pkg_name`` from its installed copyright file.

    Reads ``/usr/share/doc/<pkg_name>/copyright``. Returns ``"UNKNOWN"``
    when the file is absent, unreadable, blank, or not in DEP-5 format;
    otherwise returns whatever ``extract_dep5_license`` derives from it.
    """
    path = f"/usr/share/doc/{pkg_name}/copyright"
    if not os.path.isfile(path):
        return "UNKNOWN"

    try:
        # Undecodable bytes are replaced rather than raising.
        with open(path, "r", errors="replace") as handle:
            text = handle.read()
    except (OSError, IOError):
        return "UNKNOWN"

    if text.strip() and is_dep5(text):
        return extract_dep5_license(text)
    return "UNKNOWN"
def main():
    """Print one ``name<TAB>version<TAB>license`` line per installed dpkg package.

    Exits with status 1, after writing an error to stderr, if
    ``dpkg-query`` fails. Output lines lacking a tab separator are skipped.
    """
    query = subprocess.run(
        ["dpkg-query", "-W", "-f=${Package}\t${Version}\n"],
        capture_output=True,
        text=True,
    )
    if query.returncode != 0:
        print(f"ERROR: dpkg-query failed: {query.stderr}", file=sys.stderr)
        sys.exit(1)

    for record in query.stdout.strip().splitlines():
        # Split on the first tab only; anything after it is the version.
        name, separator, version = record.partition("\t")
        if not separator:
            continue
        print(f"{name}\t{version}\t{get_license_for_package(name)}")


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script runs INSIDE the container. It must be fully self-contained
# with zero external dependencies (only Python stdlib).
import importlib.metadata
# Conservative classifier -> SPDX mapping.
# Keys are complete trove classifier strings matched exactly by get_license();
# classifiers not listed here never produce a license.
_CLASSIFIER_MAP = {
"License :: OSI Approved :: MIT License": "MIT",
"License :: OSI Approved :: Apache Software License": "Apache-2.0",
# NOTE(review): the classifier text does not state a clause count;
# BSD-3-Clause is the value chosen here — confirm.
"License :: OSI Approved :: BSD License": "BSD-3-Clause",
"License :: OSI Approved :: ISC License (ISCL)": "ISC",
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)": "MPL-2.0",
"License :: OSI Approved :: GNU General Public License v2 (GPLv2)": "GPL-2.0-only",
"License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)": "GPL-2.0-or-later",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)": "GPL-3.0-only",
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)": "GPL-3.0-or-later",
"License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)": "LGPL-2.0-only",
"License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)": "LGPL-2.0-or-later",
"License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)": "LGPL-3.0-only",
"License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)": "LGPL-3.0-or-later",
"License :: OSI Approved :: Python Software Foundation License": "PSF-2.0",
"License :: OSI Approved :: Boost Software License 1.0 (BSL-1.0)": "BSL-1.0",
"License :: OSI Approved :: The Unlicense (Unlicense)": "Unlicense",
# NOTE(review): the classifier text carries no version; Artistic-2.0 is
# the value chosen here — confirm.
"License :: OSI Approved :: Artistic License": "Artistic-2.0",
"License :: OSI Approved :: zlib/libpng License": "Zlib",
"License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication": "CC0-1.0",
# NOTE(review): generic "Public Domain" is reported as CC0-1.0 — confirm.
"License :: Public Domain": "CC0-1.0",
}
# Conservative free-text license -> SPDX mapping.
# Keys are whole values of the "License" metadata field (after stripping);
# get_license() matches them exactly first and then case-insensitively.
# Free text that is not listed here is not mapped.
_LICENSE_MAP = {
"MIT": "MIT",
"MIT License": "MIT",
"The MIT License": "MIT",
"The MIT License (MIT)": "MIT",
"Apache License 2.0": "Apache-2.0",
"Apache License, Version 2.0": "Apache-2.0",
"Apache Software License": "Apache-2.0",
"Apache 2.0": "Apache-2.0",
"Apache-2.0": "Apache-2.0",
# NOTE(review): unqualified "BSD" / "BSD License" are reported as
# BSD-3-Clause — confirm this default.
"BSD License": "BSD-3-Clause",
"BSD": "BSD-3-Clause",
"BSD-2-Clause": "BSD-2-Clause",
"BSD-3-Clause": "BSD-3-Clause",
"3-Clause BSD License": "BSD-3-Clause",
"2-Clause BSD License": "BSD-2-Clause",
"Simplified BSD License": "BSD-2-Clause",
"New BSD License": "BSD-3-Clause",
"ISC": "ISC",
"ISC License": "ISC",
"ISC License (ISCL)": "ISC",
"MPL-2.0": "MPL-2.0",
"Mozilla Public License 2.0": "MPL-2.0",
"Mozilla Public License 2.0 (MPL 2.0)": "MPL-2.0",
"PSF-2.0": "PSF-2.0",
"Python Software Foundation License": "PSF-2.0",
"Unlicense": "Unlicense",
"The Unlicense": "Unlicense",
"CC0-1.0": "CC0-1.0",
# NOTE(review): "Public Domain" is reported as CC0-1.0 — confirm.
"Public Domain": "CC0-1.0",
"WTFPL": "WTFPL",
"Zlib": "Zlib",
}
# Lower-cased copy of the table, used as a case-insensitive fallback lookup.
_LICENSE_MAP_LOWER = {k.lower(): v for k, v in _LICENSE_MAP.items()}
def get_license(dist):
    """Derive an SPDX license string for one installed distribution.

    Sources are consulted in order and the first hit wins:

    1. ``License-Expression`` metadata, returned stripped and otherwise as-is.
    2. ``License`` metadata, only if its stripped value is a key of
       ``_LICENSE_MAP`` (exact, then case-insensitive).
    3. The first ``License ::`` classifier present in ``_CLASSIFIER_MAP``.

    Returns ``"UNKNOWN"`` when none of the sources yields a value.
    """
    meta = dist.metadata

    expression = (meta.get("License-Expression") or "").strip()
    if expression:
        return expression

    free_text = (meta.get("License") or "").strip()
    if free_text:
        spdx = _LICENSE_MAP.get(free_text) or _LICENSE_MAP_LOWER.get(free_text.lower())
        if spdx:
            return spdx

    for classifier in meta.get_all("Classifier") or []:
        if classifier.startswith("License ::") and classifier in _CLASSIFIER_MAP:
            return _CLASSIFIER_MAP[classifier]

    return "UNKNOWN"
def main():
    """Print one ``name<TAB>version<TAB>license`` line per installed distribution.

    Distributions are taken from ``importlib.metadata.distributions()``.
    Entries without a ``Name`` are skipped, and only the first occurrence
    of each lower-cased name is reported. A missing ``Version`` is printed
    as ``UNKNOWN``.
    """
    seen = set()
    for dist in importlib.metadata.distributions():
        meta = dist.metadata
        # Defensive: skip entries that expose no metadata object at all
        # (presumably a damaged dist-info directory — verify if this triggers).
        if meta is None:
            continue

        # Use .get() rather than subscripting: subscripting a missing key is
        # deprecated in importlib.metadata and is slated to raise KeyError.
        # This helper runs on whatever Python the scanned image ships.
        name = meta.get("Name")
        if not name:
            continue

        # Deduplicate (importlib.metadata can return duplicates)
        key = name.lower()
        if key in seen:
            continue
        seen.add(key)

        version = meta.get("Version") or "UNKNOWN"
        spdx = get_license(dist)
        print(f"{name}\t{version}\t{spdx}")


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Extract Python package information from a container image."""
import logging
import subprocess
from pathlib import Path
log = logging.getLogger(__name__)
_HELPER_SCRIPT_PATH = Path(__file__).resolve().parent / "helpers" / "python_helper.py"
def extract_python(
    image: str,
    docker_cmd: str = "docker",
    verbose: bool = False,
) -> list[dict[str, str]]:
    """Collect Python package attributions by running the helper inside ``image``.

    The helper script is bind-mounted read-only at ``/tmp/python_helper.py``
    and executed with the image's ``python3`` as entrypoint. Its stdout is
    expected to hold one ``name<TAB>version<TAB>license`` record per line.

    Args:
        image: Container image reference to scan.
        docker_cmd: Container CLI binary to invoke.
        verbose: When true, log the command, skipped lines and the count.

    Returns:
        A list of dicts with keys ``package_name``, ``version``, ``type``
        (always ``"python"``) and ``spdx_license``. The list is empty when
        the container exits with status 127.

    Raises:
        RuntimeError: The container exited non-zero with a status other
            than 127.
    """
    mount_spec = f"{_HELPER_SCRIPT_PATH}:/tmp/python_helper.py:ro"
    argv = [
        docker_cmd,
        "run",
        "--rm",
        "--entrypoint",
        "python3",
        "-v",
        mount_spec,
        image,
        "/tmp/python_helper.py",
    ]
    if verbose:
        log.info("Running: %s", " ".join(argv))

    proc = subprocess.run(argv, capture_output=True, text=True, timeout=300)

    # Exit status 127 is treated as "python3 is missing from the image",
    # which means there is nothing to report.
    if proc.returncode == 127:
        log.warning(
            "python3 not found in %s, skipping Python package extraction", image
        )
        return []

    if proc.returncode != 0:
        log.error(
            "Python extraction failed (exit %d): %s",
            proc.returncode,
            proc.stderr,
        )
        raise RuntimeError(f"Python extraction failed: {proc.stderr}")

    packages = []
    for raw_line in proc.stdout.strip().splitlines():
        fields = raw_line.split("\t", 2)
        if len(fields) == 3:
            name, version, spdx = fields
            packages.append(
                {
                    "package_name": name,
                    "version": version,
                    "type": "python",
                    "spdx_license": spdx,
                }
            )
        elif verbose:
            log.warning("Skipping malformed line: %r", raw_line)

    if verbose:
        log.info("Extracted %d Python packages", len(packages))
    return packages
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Generate attribution CSV files for container images.
Extracts dpkg and Python package information from a container image by
running helper scripts inside the container via `docker run`. Optionally
computes a diff against a base image to show only added/changed packages.
Usage:
python generate_attributions.py <image:tag> [--output out.csv] [--base-image base:tag]
python generate_attributions.py <image:tag> --framework vllm --cuda-version 12.9
"""
import argparse
import csv
import logging
import sys
from pathlib import Path
# Allow running as a script from any directory
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_ROOT = _SCRIPT_DIR.parent.parent
sys.path.insert(0, str(_SCRIPT_DIR))
from extractors.dpkg import extract_dpkg # noqa: E402
from extractors.python_pkgs import extract_python # noqa: E402
log = logging.getLogger(__name__)
VALID_TYPES = {"dpkg", "python"}
def resolve_base_image(
    framework: str,
    target: str,
    cuda_version: str,
    context_yaml_path: Path,
) -> str:
    """Resolve the base image from context.yaml for a framework/target/cuda combo.

    For ``target == "frontend"`` the value of ``dynamo.frontend_image`` is
    returned and ``framework``/``cuda_version`` are not consulted. For any
    other target the result is ``"<runtime_image>:<runtime_image_tag>"``
    read from ``<framework>.cuda<cuda_version>``.

    Args:
        framework: Top-level key in context.yaml to look under.
        target: ``"frontend"`` or a runtime target.
        cuda_version: Version string appended to ``"cuda"`` to form the
            sub-key (e.g. ``"12.9"`` -> ``"cuda12.9"``).
        context_yaml_path: Location of context.yaml.

    Returns:
        The resolved base image reference.

    Exits the process with status 1 (after logging an error) when PyYAML is
    not importable, the file is missing or is not a mapping, or the
    required keys are absent.
    """
    try:
        import yaml
    except ImportError:
        log.error(
            "PyYAML is required for --framework/--cuda-version base image resolution. "
            "Install it with: pip install pyyaml"
        )
        sys.exit(1)

    if not context_yaml_path.is_file():
        log.error("context.yaml not found at %s", context_yaml_path)
        sys.exit(1)

    with open(context_yaml_path, "r") as f:
        # safe_load yields None for an empty document; treat that as an
        # empty mapping so the lookups below fail with a clear message.
        context = yaml.safe_load(f) or {}

    if not isinstance(context, dict):
        log.error(
            "context.yaml at %s does not contain a top-level mapping",
            context_yaml_path,
        )
        sys.exit(1)

    def _section(mapping, key):
        """Return mapping[key] if it is a dict, else an empty dict."""
        value = mapping.get(key)
        return value if isinstance(value, dict) else {}

    if target == "frontend":
        frontend_image = _section(context, "dynamo").get("frontend_image")
        if not frontend_image:
            log.error("frontend_image not found in context.yaml dynamo section")
            sys.exit(1)
        return frontend_image

    # Runtime target: look up runtime_image and runtime_image_tag
    cuda_key = f"cuda{cuda_version}"
    cuda_config = _section(_section(context, framework), cuda_key)
    runtime_image = cuda_config.get("runtime_image")
    runtime_image_tag = cuda_config.get("runtime_image_tag")

    if not runtime_image or not runtime_image_tag:
        log.error(
            "Could not resolve base image for framework=%s cuda=%s target=%s. "
            "Keys runtime_image/runtime_image_tag not found under %s.%s in context.yaml",
            framework,
            cuda_version,
            target,
            framework,
            cuda_key,
        )
        sys.exit(1)

    return f"{runtime_image}:{runtime_image_tag}"
def compute_diff(
    target_packages: list[dict[str, str]],
    base_packages: list[dict[str, str]],
) -> list[dict[str, str]]:
    """Select the target packages that are absent from, or differ in version to, base.

    Packages are identified by the pair ``(package_name, type)``. The
    returned list keeps the order of ``target_packages`` and contains the
    original dict objects. Packages that exist only in base are not
    reported.
    """
    base_versions = {
        (entry["package_name"], entry["type"]): entry["version"]
        for entry in base_packages
    }

    def _is_new_or_changed(entry):
        known = base_versions.get((entry["package_name"], entry["type"]))
        return known is None or known != entry["version"]

    return [entry for entry in target_packages if _is_new_or_changed(entry)]
def write_csv(packages: list[dict[str, str]], output_path: str | None) -> None:
"""Write packages to CSV, sorted by (type, package_name)."""
sorted_packages = sorted(packages, key=lambda p: (p["type"], p["package_name"]))
fieldnames = ["package_name", "version", "type", "spdx_license"]
if output_path:
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
log.info("Wrote %d entries to %s", len(sorted_packages), output_path)
else:
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(sorted_packages)
def extract_all(
    image: str,
    types: set[str],
    docker_cmd: str,
    verbose: bool,
) -> list[dict[str, str]]:
    """Gather package records from ``image`` for each requested type.

    dpkg results, when requested, precede Python results in the returned
    list. Type names other than ``"dpkg"`` and ``"python"`` are ignored.
    """
    collected: list[dict[str, str]] = []

    want_dpkg = "dpkg" in types
    want_python = "python" in types

    if want_dpkg:
        log.info("Extracting dpkg packages from %s ...", image)
        collected += extract_dpkg(image, docker_cmd=docker_cmd, verbose=verbose)

    if want_python:
        log.info("Extracting Python packages from %s ...", image)
        collected += extract_python(image, docker_cmd=docker_cmd, verbose=verbose)

    return collected
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Defines one positional argument (``image``) and the options
    ``--output/-o``, ``--base-image``, ``--framework``, ``--target``,
    ``--cuda-version``, ``--context-yaml``, ``--types``, ``--docker-cmd``
    and ``--verbose/-v``.
    """
    examples = """
Examples:
%(prog)s my-registry/dynamo:vllm-runtime -o vllm.csv
%(prog)s my-registry/dynamo:vllm-runtime --framework vllm --cuda-version 12.9 -o vllm.csv
%(prog)s my-registry/dynamo:vllm-runtime --base-image nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04 -o vllm.csv
%(prog)s my-registry/dynamo:frontend --framework dynamo --target frontend -o frontend.csv
"""
    cli = argparse.ArgumentParser(
        description="Generate attribution CSV files for container images",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples,
    )

    # What to scan and where to write the result.
    cli.add_argument(
        "image", help="Container image to scan (e.g., my-registry/dynamo:latest)"
    )
    cli.add_argument("--output", "-o", help="Output CSV file path (default: stdout)")

    # Base image selection for the optional diff.
    cli.add_argument(
        "--base-image",
        help="Base image for diff calculation (explicit, overrides --framework auto-resolve)",
    )
    cli.add_argument(
        "--framework",
        choices=["vllm", "sglang", "trtllm", "dynamo"],
        help="Framework name for auto-resolving base image from context.yaml",
    )
    cli.add_argument(
        "--target",
        default="runtime",
        choices=["runtime", "frontend"],
        help="Build target for base image resolution (default: runtime)",
    )
    cli.add_argument(
        "--cuda-version",
        choices=["12.9", "13.0", "13.1"],
        help="CUDA version for base image resolution",
    )
    default_context = str(_REPO_ROOT / "container" / "context.yaml")
    cli.add_argument(
        "--context-yaml",
        default=default_context,
        help="Path to context.yaml (default: container/context.yaml in repo root)",
    )

    # Extraction behaviour.
    cli.add_argument(
        "--types",
        default="dpkg,python",
        help="Comma-separated extraction types (default: dpkg,python)",
    )
    cli.add_argument(
        "--docker-cmd",
        default="docker",
        help="Docker command to use (default: docker)",
    )
    cli.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose logging"
    )

    return cli.parse_args()
def main() -> None:
    """Command-line entry point: scan an image and write attribution CSVs.

    Steps, in order:

    1. Parse arguments and configure logging to stderr.
    2. Validate ``--types`` (exit status 1 on unknown or empty selection).
    3. Determine the base image: ``--base-image`` if given, otherwise
       resolved from context.yaml when ``--framework`` is given.
    4. Extract packages from the target image and write the full CSV.
    5. If a base image is known, extract it too and write the diff CSV to
       ``<stem>_diff<suffix>`` next to ``--output``, or to stdout when no
       output path was given.
    """
    args = parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    # Tolerate surrounding whitespace and stray commas, e.g. "dpkg, python".
    types = {item.strip() for item in args.types.split(",") if item.strip()}
    if not types:
        log.error("No extraction types given (valid: %s)", VALID_TYPES)
        sys.exit(1)
    invalid = types - VALID_TYPES
    if invalid:
        log.error("Invalid types: %s (valid: %s)", invalid, VALID_TYPES)
        sys.exit(1)

    # Resolve base image if needed
    base_image = args.base_image
    if not base_image and args.framework:
        if args.target != "frontend" and not args.cuda_version:
            log.error(
                "--cuda-version is required when using --framework for runtime targets"
            )
            sys.exit(1)
        base_image = resolve_base_image(
            framework=args.framework,
            target=args.target,
            cuda_version=args.cuda_version or "",
            context_yaml_path=Path(args.context_yaml),
        )
        log.info("Auto-resolved base image: %s", base_image)

    # Extract from target image
    target_packages = extract_all(args.image, types, args.docker_cmd, args.verbose)
    log.info("Total packages extracted from target: %d", len(target_packages))

    # Write full CSV
    write_csv(target_packages, args.output)

    # Compute and write diff if base image is available
    if base_image:
        log.info("Extracting packages from base image for diff: %s", base_image)
        base_packages = extract_all(base_image, types, args.docker_cmd, args.verbose)
        log.info("Total packages extracted from base: %d", len(base_packages))

        diff_packages = compute_diff(target_packages, base_packages)
        log.info("Diff: %d new/changed packages", len(diff_packages))

        if args.output:
            # Insert _diff before the file extension
            output_path = Path(args.output)
            diff_path = str(output_path.with_stem(output_path.stem + "_diff"))
            write_csv(diff_packages, diff_path)
        else:
            # The diff CSV (with its own header row) follows the full CSV on
            # stdout; the human-readable separator line goes to stderr.
            print("\n# --- DIFF (new/changed packages vs base) ---", file=sys.stderr)
            write_csv(diff_packages, None)


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment