Initial commit

f356f546 · maming · f356f546 · f356f546 · f356f546 · f356f546
Commit f356f546 authored Feb 04, 2026 by maming
20 changed files
--- a/.gitignore
+++ b/.gitignore
+__pycache__
+*.bak
+*.log
+.vscode
+.idea
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "Megatron-LM"]
+	path = Megatron-LM
+	url = https://github.com/NVIDIA/Megatron-LM.git
+	# Branch followed by `git submodule update --remote`.
+	branch = main
+[submodule "Megatron-Energon"]
+	path = Megatron-Energon
+	url = https://github.com/NVIDIA/Megatron-Energon.git
--- a/Megatron-Energon/.coveragerc
+++ b/Megatron-Energon/.coveragerc
+[report]
+include = ./src/megatron/energon/**
+
+[xml]
+output = ./coverage.xml
\ No newline at end of file
--- a/Megatron-Energon/.env
+++ b/Megatron-Energon/.env
+PYTHONPATH=src
--- a/Megatron-Energon/.github/workflows/documentation.yml
+++ b/Megatron-Energon/.github/workflows/documentation.yml
+name: documentation
+
+# Runs on every push and on manual dispatch
+on: [push, workflow_dispatch]
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Build job: builds the Sphinx docs and uploads them as a Pages artifact
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install just
+        uses: extractions/setup-just@v3
+
+      - name: Install dependencies
+        run: |
+          just dev-sync
+
+      - name: Sphinx build
+        run: |
+          just docs
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/build
+
+  # Deployment job: publishes the artifact produced by the build job
+  deploy:
+    environment:
+      name: github-pages
+      # Page URL reported by the "deployment" step below
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    # Deploy only for runs whose ref name is `main`; other refs stop after the build
+    if: github.ref_name == 'main'
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
--- a/Megatron-Energon/.github/workflows/license_headers.yml
+++ b/Megatron-Energon/.github/workflows/license_headers.yml
+name: verify license headers
+
+# Runs for pushes and pull requests targeting develop, and on manual dispatch
+on:
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  license-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repository
+        # Same major version as the other workflows in this repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so the version is passed as a string, never as a YAML float
+          python-version: "3.9"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install click==8.1.7
+      - name: Run License Header Check
+        run: python scripts/license_headers.py .
--- a/Megatron-Energon/.github/workflows/release.yml
+++ b/Megatron-Energon/.github/workflows/release.yml
+name: release
+
+# Triggered when a GitHub release is published
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  deploy:
+    name: Upload release to PyPI
+    runs-on: ubuntu-latest
+    permissions:
+      # This permission is mandatory for trusted publishing
+      id-token: write
+    environment:
+      name: release
+      url: https://pypi.org/project/megatron-energon
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install just
+        uses: extractions/setup-just@v3
+
+      - name: Install dependencies
+        run: |
+          just dev-sync
+
+      - name: Build package
+        run: |
+          just build
+
+      # Publishes the package built by the previous step
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
--- a/Megatron-Energon/.github/workflows/ruff.yml
+++ b/Megatron-Energon/.github/workflows/ruff.yml
+name: ruff checks
+
+# Runs for pushes and pull requests targeting develop, and on manual dispatch
+on:
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install just
+        uses: extractions/setup-just@v3
+
+      - name: Install dependencies
+        run: |
+          just dev-sync
+
+      # Runs the `check` recipe from the project's justfile
+      - name: Check code
+        run: |
+          just check
--- a/Megatron-Energon/.github/workflows/tests.yml
+++ b/Megatron-Energon/.github/workflows/tests.yml
+name: tests
+
+# Runs for pushes and pull requests targeting develop, and on manual dispatch
+on:
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  unittest:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install just
+        uses: extractions/setup-just@v3
+
+      # Pin the interpreter before dependencies are synced
+      - name: Install minimum supported python version
+        run: |
+          uv python pin 3.10
+
+      - name: Install dependencies
+        run: |
+          just dev-sync
+
+      # Runs the `test` recipe from the project's justfile
+      - name: Run unit tests
+        run: |
+          just test
--- a/Megatron-Energon/.gitignore
+++ b/Megatron-Energon/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# C extensions
+*.so
+
+# C objects
+*.o
+
+# IDE
+/.idea
+/.vscode
+.DS_Store
+.spyderproject
+.spyproject
+.ropeproject
+*.code-workspace
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# Environments
+#.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Temporary Data
+/test_dataset
+
+# drawio temp files
+.$*
+
+# ninja build
+.ninja*
+
+# Other files
+*.log
+*.csv
+/*.png
+/public
+/.run
--- a/Megatron-Energon/CONTRIBUTING.md
+++ b/Megatron-Energon/CONTRIBUTING.md
+<!--- Copyright (c) 2025, NVIDIA CORPORATION.
+SPDX-License-Identifier: BSD-3-Clause -->
+
+## Megatron-Energon OSS Contribution Rules
+
+
+#### Coding Guidelines
+
+- We recommend discussing and planning contributions by opening an issue beforehand.
+
+- All source code contributions must strictly adhere to ruff linter and formatter as configured in `pyproject.toml`.
+    - You can check those by running `just check`.
+
+- All source code files must include the NVIDIA license header as you can see in `src/megatron/energon/__init__.py`.
+
+- In addition, please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
+
+- Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved.
+
+- Try to keep pull requests (PRs) as concise as possible:
+  - Avoid committing commented-out code.
+  - Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
+
+- Ensure that all tests pass prior to submitting your code.
+
+- All components must contain accompanying documentation in `docs/source`.
+
+- Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [sign off](#signing-your-work) on your commits.
+
+
+#### Signing Your Work
+
+* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
+
+  * Any contribution which contains commits that are not Signed-Off will not be accepted.
+
+* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
+  ```bash
+  $ git commit -s -m "Add cool feature."
+  ```
+  This will append the following to your commit message:
+  ```
+  Signed-off-by: Your Name <your@email.com>
+  ```
+
+* Full text of the DCO:
+
+  ```
+    Developer Certificate of Origin
+    Version 1.1
+    
+    Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+    1 Letterman Drive
+    Suite D4700
+    San Francisco, CA, 94129
+    
+    Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+
+
+    Developer's Certificate of Origin 1.1
+    
+    By making a contribution to this project, I certify that:
+    
+    (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
+    
+    (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
+    
+    (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
+    
+    (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
+  ```
--- a/Megatron-Energon/LICENSE
+++ b/Megatron-Energon/LICENSE
--- a/Megatron-Energon/README.md
+++ b/Megatron-Energon/README.md
+<!--- Copyright (c) 2025, NVIDIA CORPORATION.
+SPDX-License-Identifier: BSD-3-Clause -->
+<a name="top"></a>
+
+<div align="center">
+  <h4 align="center">Megatron's multi-modal data loader</h4>
+  <h2 align="center">Megatron Energon</h2>
+  <p align="center">
+    <a href="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/tests.yml"><img src="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/tests.yml/badge.svg" alt="Tests"></a> <a href="https://nvidia.github.io/Megatron-Energon/"><img src="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/documentation.yml/badge.svg" alt="Documentation"></a>
+    <br />
+    <a href="https://github.com/NVIDIA/Megatron-Energon/issues">Report Bug</a>
+    ·
+    <a href="https://github.com/NVIDIA/Megatron-Energon/issues">Request Feature</a>
+  </p>
+</div>
+
+<br />
+
+ _**DISCLAIMER**: This package contains research code. APIs may change._
+
+# What is this?
+
+**Megatron Energon** is the multi-modal data loader of [Megatron](https://github.com/NVIDIA/Megatron-LM) (you can also use it independently).
+
+It's best at
+
+- loading large training data to train large multi-modal models
+- blending many different datasets together
+- distributing the work across many nodes and processes of a cluster
+- ensuring reproducibility and resumability
+- adapting easily to various types of data samples and processing
+
+Try using it together with [Megatron](https://github.com/NVIDIA/Megatron-LM) Core.
+
+# Quickstart
+**Megatron Energon** is a pip-installable Python package that offers
+- dataset-related classes that you can import in your project
+- a command line utility for data preprocessing and conversion
+
+This document is just a quick start. Please also check out the [documentation](https://nvidia.github.io/Megatron-Energon/).
+
+## Installation
+
+To install the latest stable version:
+```shell
+pip install megatron-energon
+```
+
+Or to install the current development version:
+```shell
+pip install git+https://github.com/NVIDIA/Megatron-Energon.git
+```
+
+**NOTE**: We encourage you to install the package (and not just import a local copy). This will ensure you have all the needed dependencies and that you can use the command line tool.
+
+For more details on installing this package, see [here](https://nvidia.github.io/Megatron-Energon/intro/installation.html).
+
+## Usage of command line tool
+
+After installation, the command `energon` will be available.
+
+Here are some examples of things you can do:
+
+| Command | Description  |
+|---|---|
+| `energon prepare DATASET_ROOT` | Take an existing WebDataset and add the required yaml files to turn it into an energon-compatible dataset |
+| `energon lint DATASET_ROOT` | Verify that the dataset complies with the energon dataset format and that all samples are loadable |
+
+
+## Usage of the library
+
+To get started, pick a [WebDataset](https://github.com/webdataset/webdataset)-compliant dataset and run `energon prepare DATASET_ROOT` on it, to run the interactive assistant and create the `.nv-meta` folder. As an alternative to WebDataset, Energon also supports the JSONL format, see [here](https://nvidia.github.io/Megatron-Energon/basic/data_prep.html).
+
+Once done, try to load it from your Python program:
+
+```python
+from megatron.energon import get_train_dataset, get_loader, WorkerConfig
+
+
+simple_worker_config = WorkerConfig(rank=0, world_size=1, num_workers=2)
+
+
+train_ds = get_train_dataset(
+    '/my/dataset/path',
+    batch_size=2,
+    shuffle_buffer_size=None,
+    max_samples_per_sequence=None,
+    worker_config=simple_worker_config,
+)
+
+train_loader = get_loader(train_ds)
+
+for batch in train_loader:
+    # Do something with batch
+    # Infer, gradient step, ...
+    pass
+```
+
+For more details, read the [documentation](https://nvidia.github.io/Megatron-Energon/).
--- a/Megatron-Energon/docker/energon-ci.Dockerfile
+++ b/Megatron-Energon/docker/energon-ci.Dockerfile
+# CI image built on top of the NVIDIA PyTorch container.
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+
+# Environment variables baked into the image. Written in the `ENV key=value`
+# form; the space-separated `ENV key value` form is legacy syntax.
+ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.7;8.9;9.0"
+ENV MMCV_WITH_OPS=1
+ENV FORCE_CUDA=1
+
+RUN python3 -m pip install --upgrade pip
+
+# Install, then uninstall to get only the deps
+COPY . ./megatron-energon
+RUN pip install -e ./megatron-energon && pip uninstall -y megatron-energon && rm -rf ./megatron-energon
--- a/Megatron-Energon/docs/Makefile
+++ b/Megatron-Energon/docs/Makefile
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = python3 -m sphinx.cmd.build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# Keep the line after this rule truly empty: a line holding only a tab is
+# parsed by make as an empty recipe, not as a blank line.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/Megatron-Energon/docs/README.md
+++ b/Megatron-Energon/docs/README.md
+<!--- Copyright (c) 2025, NVIDIA CORPORATION.
+SPDX-License-Identifier: BSD-3-Clause -->
+
+# Building the documentation
+
+To build the documentation, you need sphinx and additional packages:
+
+- sphinx-rtd-theme
+- sphinx
+- sphinxcontrib-napoleon
+- myst-parser
+- sphinx-click
+
+You can install them with:
+
+`pip install sphinx-rtd-theme sphinx sphinxcontrib-napoleon myst-parser sphinx-click`
+
+Use `make html` to build it.
+
+Or use PyCharm by adding a configuration:
+
+`Run -> Edit Configurations -> Add new Configuration -> Python docs -> Sphinx task`
+
+Use the `docs/source` folder as the input folder and `docs/build` as the output folder.
--- a/Megatron-Energon/docs/source/_static/android-chrome-192x192.png
+++ b/Megatron-Energon/docs/source/_static/android-chrome-192x192.png
--- a/Megatron-Energon/docs/source/_static/android-chrome-512x512.png
+++ b/Megatron-Energon/docs/source/_static/android-chrome-512x512.png
--- a/Megatron-Energon/docs/source/_static/apple-touch-icon.png
+++ b/Megatron-Energon/docs/source/_static/apple-touch-icon.png
--- a/Megatron-Energon/docs/source/_static/css/custom.css
+++ b/Megatron-Energon/docs/source/_static/css/custom.css
+/* Disabled variant, kept for reference: its selectors only match when a
+   signature has at least three .sig-param elements. */
+/*.sig-param:nth-child(1 of .sig-param):nth-last-child(n + 3 of .sig-param)::before,
+.sig-param:nth-child(1 of .sig-param):nth-last-child(n + 3 of .sig-param) ~ .sig-param ::before {
+    content: "\a\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20";
+    white-space: pre;
+}*/
+
+/* Newlines (\a) and spaces (\20) before each parameter */
+/* `white-space: pre` keeps the generated newline and spaces from collapsing */
+.sig-param::before {
+    content: "\a\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20";
+    white-space: pre;
+}
+
+/* Newline after the last parameter (so the closing bracket is on a new line) */
+dt em.sig-param:last-of-type::after {
+    content: "\a";
+    white-space: pre;
+}
+
+/* To have blue background of width of the block (instead of width of content) */
+/* Applies to the first <dt> that is a direct child of a dl.class */
+dl.class > dt:first-of-type {
+    display: block !important;
+}