Commit f356f546 authored by maming's avatar maming
Browse files

Initial commit

parents
Pipeline #3339 canceled with stages
__pycache__
*.bak
*.log
.vscode
.idea
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
branch = main
[submodule]
Megatron-LM = main
[submodule "Megatron-Energon"]
path = Megatron-Energon
url = https://github.com/NVIDIA/Megatron-Energon.git
[report]
include = ./src/megatron/energon/**
[xml]
output = ./coverage.xml
\ No newline at end of file
# Builds the Sphinx documentation and publishes it to GitHub Pages.
name: documentation

# Triggered by every push and by manual dispatch.
on:
  - push
  - workflow_dispatch

# GITHUB_TOKEN permissions required for a GitHub Pages deployment.
permissions:
  contents: read
  pages: write
  id-token: write

# Only one deployment runs at a time; runs queued between the in-progress run
# and the latest queued run are skipped. In-progress runs are NOT cancelled so
# that production deployments can complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Build job: renders the docs and uploads them as a Pages artifact.
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install just
        uses: extractions/setup-just@v3
      - name: Install dependencies
        run: just dev-sync
      - name: Sphinx build
        run: just docs
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: docs/build

  # Deployment job: runs after `build`, and only when the ref name is `main`.
  deploy:
    needs: build
    if: github.ref_name == 'main'
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
# Checks that source files carry the required license header by running
# scripts/license_headers.py over the repository.
name: verify license headers

# Runs on pushes and pull requests targeting `develop`, and on manual dispatch.
on:
  push:
    branches:
      - develop
  workflow_dispatch:
  pull_request:
    branches:
      - develop

jobs:
  license-check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        # Same major version as the other workflows in this repository.
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string instead of a float.
          python-version: "3.9"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install click==8.1.7
      - name: Run License Header Check
        run: python scripts/license_headers.py .
# Builds the package and uploads it to PyPI whenever a release is published.
name: release

on:
  release:
    types:
      - published

jobs:
  deploy:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: release
      url: https://pypi.org/project/megatron-energon
    permissions:
      # This permission is mandatory for trusted publishing
      id-token: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install just
        uses: extractions/setup-just@v3
      - name: Install dependencies
        run: just dev-sync
      - name: Build package
        run: just build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
# Runs `just check` on the code base.
name: ruff checks

# Runs on pushes and pull requests targeting `develop`, and on manual dispatch.
on:
  push:
    branches: [develop]
  workflow_dispatch:
  pull_request:
    branches: [develop]

jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install just
        uses: extractions/setup-just@v3
      - name: Install dependencies
        run: just dev-sync
      - name: Check code
        run: just check
# Runs the unit tests via `just test` with Python pinned to 3.10.
name: tests

# Runs on pushes and pull requests targeting `develop`, and on manual dispatch.
on:
  push:
    branches: [develop]
  workflow_dispatch:
  pull_request:
    branches: [develop]

jobs:
  unittest:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install just
        uses: extractions/setup-just@v3
      # Pin before syncing so the dependencies are installed for that version.
      - name: Install minimum supported python version
        run: uv python pin 3.10
      - name: Install dependencies
        run: just dev-sync
      - name: Run unit tests
        run: just test
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# C extensions
*.so
# C objects
*.o
# IDE
/.idea
/.vscode
.DS_Store
.spyderproject
.spyproject
.ropeproject
*.code-workspace
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# Environments
#.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Temporary Data
/test_dataset
# drawio temp files
.$*
# ninja build
.ninja*
# Other files
*.log
*.csv
/*.png
/public
/.run
<!--- Copyright (c) 2025, NVIDIA CORPORATION.
SPDX-License-Identifier: BSD-3-Clause -->
## Megatron-Energon OSS Contribution Rules
#### Coding Guidelines
- We recommend discussing and planning contributions by opening an issue beforehand.
- All source code contributions must strictly adhere to ruff linter and formatter as configured in `pyproject.toml`.
- You can check those by running `just check`.
- All source code files must include the NVIDIA license header as you can see in `src/megatron/energon/__init__.py`.
- In addition, please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
- Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved.
- Try to keep pull requests (PRs) as concise as possible:
- Avoid committing commented-out code.
- Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
- Ensure that all tests pass prior to submitting your code.
- All components must contain accompanying documentation in `docs/source`.
- Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [`sign`](#signing-your-work) your commit.
#### Signing Your Work
* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
* Any contribution which contains commits that are not Signed-Off will not be accepted.
* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
```bash
$ git commit -s -m "Add cool feature."
```
This will append the following to your commit message:
```
Signed-off-by: Your Name <your@email.com>
```
* Full text of the DCO:
```
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
1 Letterman Drive
Suite D4700
San Francisco, CA, 94129
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
```
This diff is collapsed.
<!--- Copyright (c) 2025, NVIDIA CORPORATION.
SPDX-License-Identifier: BSD-3-Clause -->
<a name="top"></a>
<div align="center">
<h4 align="center">Megatron's multi-modal data loader</h4>
<h2 align="center">Megatron Energon</h2>
<p align="center">
<a href="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/tests.yml"><img src="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/tests.yml/badge.svg" alt="Tests"></a> <a href="https://nvidia.github.io/Megatron-Energon/"><img src="https://github.com/NVIDIA/Megatron-Energon/actions/workflows/documentation.yml/badge.svg" alt="Documentation"></a>
<br />
<a href="https://github.com/NVIDIA/Megatron-Energon/issues">Report Bug</a>
·
<a href="https://github.com/NVIDIA/Megatron-Energon/issues">Request Feature</a>
</p>
</div>
<br />
_**DISCLAIMER**: This package contains research code. APIs may change._
# What is this?
**Megatron Energon** is the multi-modal data loader of [Megatron](https://github.com/NVIDIA/Megatron-LM) (you can also use it independently).
It's best at
- loading large training data to train large multi-modal models
- blending many different datasets together
- distributing the work across many nodes and processes of a cluster
- ensuring reproducibility and resumability
- adapting easily to various types of data samples and processing
Try using it together with [Megatron](https://github.com/NVIDIA/Megatron-LM) Core.
# Quickstart
**Megatron Energon** is a pip-installable python package that offers
- dataset-related classes that you can import in your project
- a command line utility for data preprocessing and conversion
This document is just a quick start. Please also check out the [documentation](https://nvidia.github.io/Megatron-Energon/).
## Installation
To install the latest stable version:
```shell
pip install megatron-energon
```
Or to install the current development version:
```shell
pip install git+https://github.com/NVIDIA/Megatron-Energon.git
```
**NOTE**: We encourage you to install the package (and not just import a local copy). This will ensure you have all the needed dependencies and that you can use the command line tool.
For more details on installing this package, see [here](https://nvidia.github.io/Megatron-Energon/intro/installation.html).
## Usage of command line tool
After installation, the command `energon` will be available.
Here are some examples of things you can do:
| Command | Description |
|---|---|
| `energon prepare DATASET_ROOT` | Take an existing WebDataset and add the required yaml files to turn it into an energon-compatible dataset |
| `energon lint DATASET_ROOT` | Verify that the dataset complies with the energon dataset format and that all samples are loadable |
## Usage of the library
To get started, pick a [WebDataset](https://github.com/webdataset/webdataset)-compliant dataset and run `energon prepare DATASET_ROOT` on it. This launches the interactive assistant and creates the `.nv-meta` folder. As an alternative to WebDataset, Energon also supports the JSONL format; see [here](https://nvidia.github.io/Megatron-Energon/basic/data_prep.html).
Once done, try to load it from your Python program:
```python
from megatron.energon import get_train_dataset, get_loader, WorkerConfig

# Worker configuration for rank 0 of a world of size 1, using 2 workers.
simple_worker_config = WorkerConfig(rank=0, world_size=1, num_workers=2)

# Create the training dataset from the prepared dataset path.
train_ds = get_train_dataset(
    '/my/dataset/path',
    batch_size=2,
    shuffle_buffer_size=None,
    max_samples_per_sequence=None,
    worker_config=simple_worker_config,
)

# Wrap the dataset in a loader and iterate over its batches.
train_loader = get_loader(train_ds)

for batch in train_loader:
    # Do something with batch
    # Infer, gradient step, ...
    pass
```
For more details, read the [documentation](https://nvidia.github.io/Megatron-Energon/).
# Image based on the NVIDIA PyTorch container with only the dependencies of
# megatron-energon installed (the package itself is removed again).
FROM nvcr.io/nvidia/pytorch:24.02-py3

# Build-related environment variables. Written in the `key=value` form; the
# legacy whitespace-separated `ENV key value` form is deprecated.
ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.7;8.9;9.0"
ENV MMCV_WITH_OPS=1
ENV FORCE_CUDA=1

RUN python3 -m pip install --upgrade pip

# Install, then uninstall to get only the deps
COPY . ./megatron-energon
RUN pip install -e ./megatron-energon && pip uninstall -y megatron-energon && rm -rf ./megatron-energon
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line; SPHINXOPTS and
# SPHINXBUILD are also picked up from the environment (hence `?=`).
SPHINXOPTS    ?=
SPHINXBUILD   ?= python3 -m sphinx.cmd.build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is phony so that targets depending on it (the catch-all below)
# are always considered out of date and are always handed to Sphinx.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
<!--- Copyright (c) 2025, NVIDIA CORPORATION.
SPDX-License-Identifier: BSD-3-Clause -->
# Building the documentation
To build the documentation, you need sphinx and additional packages:
- sphinx-rtd-theme
- sphinx
- sphinxcontrib-napoleon
- myst-parser
- sphinx-click
You can install these with:
`pip install sphinx-rtd-theme sphinx sphinxcontrib-napoleon myst-parser sphinx-click`
Use `make html` to build it.
Or use PyCharm by adding a configuration:
`Run -> Edit Configurations -> Add new Configuration -> Python docs -> Sphinx task`
Use the `docs/source` folder as the input folder and `docs/build` as the output folder.
/*.sig-param:nth-child(1 of .sig-param):nth-last-child(n + 3 of .sig-param)::before,
.sig-param:nth-child(1 of .sig-param):nth-last-child(n + 3 of .sig-param) ~ .sig-param ::before {
content: "\a\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20";
white-space: pre;
}*/
/* Start each signature parameter on its own indented line: the generated
   content is a newline (\a) followed by a run of spaces (\20), and
   `white-space: pre` keeps those characters from being collapsed. */
.sig-param::before {
    white-space: pre;
    content: "\a\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20\20";
}

/* Emit a newline after the last parameter so that the closing bracket
   ends up on a line of its own. */
dt em.sig-param:last-of-type::after {
    white-space: pre;
    content: "\a";
}

/* Make the blue background span the width of the block rather than the
   width of its content. */
dl.class > dt:first-of-type {
    display: block !important;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment