Commit e532679c authored by oahzxl's avatar oahzxl
Browse files

Merge branch 'main' of https://github.com/oahzxl/ColossalAI into chunk

parents c1492e50 7d5640b9
import os
def show_files(path, all_files=None):
    """Recursively collect the paths of all files below ``path``.

    Args:
        path: Directory to scan.
        all_files: Optional list that discovered file paths are appended to.
            A new list is created when it is omitted.

    Returns:
        The list holding every file path found (``all_files`` itself when one
        was supplied). Each path is built with ``os.path.join`` starting from
        ``path``.
    """
    if all_files is None:
        all_files = []
    # os.listdir returns entries in arbitrary order; sort them so the result
    # is reproducible across runs and platforms.
    for file_name in sorted(os.listdir(path)):
        cur_path = os.path.join(path, file_name)
        if os.path.isdir(cur_path):
            # Sub-folder: descend and keep appending to the same list.
            show_files(cur_path, all_files)
        else:
            all_files.append(cur_path)
    return all_files
def join(input_list, sep=None):
    """Concatenate the strings in ``input_list`` with ``sep`` between them.

    Args:
        input_list: Iterable of strings to join.
        sep: Separator string. A single space is used when it is ``None``.
            An explicit empty string is honoured and joins the items with
            nothing between them.

    Returns:
        The joined string.
    """
    # Compare against None rather than relying on truthiness, otherwise an
    # intentional empty separator would be replaced by a space.
    if sep is None:
        sep = ' '
    return sep.join(input_list)
def main():
    """Print the ``<area>/<application>`` folders found under ``examples/``.

    Every file below ``examples/`` is listed with ``show_files``. A file only
    counts when it sits at least two folder levels below ``examples``:
    ``examples/images/vit/...`` is accepted, while ``examples/images/README.md``
    and ``examples/requirements.txt`` are not. The unique two-level folder
    names are printed as a Python list, in first-seen order.
    """
    contents = show_files('examples/', [])
    # A dict keeps first-seen order and gives O(1) duplicate detection.
    all_loc = {}
    for file_loc in contents:
        # show_files builds paths with os.path.join, i.e. with the platform
        # separator; normalise it so the depth check and the printed names are
        # the same on every OS.
        split_loc = file_loc.replace(os.sep, '/').split('/')
        # examples / area / application / file -> at least four components.
        if len(split_loc) >= 4:
            all_loc.setdefault('/'.join(split_loc[1:3]))
    print(list(all_loc))


if __name__ == '__main__':
    main()
import argparse
def main():
    """Print the example folders touched by a list of changed files.

    The changed files are read from ``-f/--fileNameList`` as a single string
    of paths separated by ``:``. Only paths of the form
    ``examples/<area>/<application>/<file...>`` are considered; for each one
    ``<area>/<application>`` is recorded. The unique folders are printed as a
    sorted Python list so that a calling shell script can read them.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
    args = parser.parse_args()
    # The option is not required, so its value is None when it is omitted;
    # treat that the same as an empty list of changed files.
    name_list = (args.fileNameList or '').split(":")
    folder_need_check = set()
    for loc in name_list:
        # Find only the sub-sub-folder of the 'examples' folder.
        # The examples folder structure is like
        # - examples
        #   - area
        #     - application
        #       - file
        parts = loc.split("/")
        if parts[0] == "examples" and len(parts) >= 4:
            folder_need_check.add('/'.join(parts[1:3]))
    # Output the result using print. Then the shell can get the values.
    # Sorting makes the output independent of set iteration order.
    print(sorted(folder_need_check))


if __name__ == '__main__':
    main()
......@@ -2,9 +2,10 @@
# coding: utf-8
import argparse
import requests
import re
import os
import re
import requests
COMMIT_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/commits'
TAGS_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/tags'
......
......@@ -43,4 +43,3 @@ jobs:
assignees: ${{ github.actor }}
delete-branch: true
branch: create-pull-request/patch-sync-submodule
\ No newline at end of file
......@@ -134,10 +134,23 @@ dmypy.json
.vscode/
# macos
.DS_Store
*.DS_Store
#data/
docs/.build
# pytorch checkpoint
*.pt
# ignore version.py generated by setup.py
colossalai/version.py
# ignore any kernel build files
.o
.so
# ignore python interface definition file
.pyi
# ignore coverage test file
converage.lcov
Copyright 2021- The Colossal-ai Authors. All rights reserved.
Copyright 2021- HPC-AI Technology Inc. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
......@@ -187,7 +187,7 @@ Copyright 2021- The Colossal-ai Authors. All rights reserved.
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Copyright 2021- HPC-AI Technology Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
include *.txt README.md
recursive-include requirements *.txt
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc
\ No newline at end of file
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
recursive-include op_builder *.py
# Colossal-AI
<div id="top" align="center">
[![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Colossal-AI_logo.png)](https://www.colossalai.org/)
[![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
Colossal-AI: 一个面向大模型时代的通用深度学习系统
......@@ -22,6 +22,13 @@
</div>
## 新闻
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
## 目录
<ul>
......@@ -30,12 +37,12 @@
<li>
<a href="#并行训练样例展示">并行训练样例展示</a>
<ul>
<li><a href="#ViT">ViT</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
<li><a href="#PaLM">PaLM</a></li>
<li><a href="#OPT">OPT</a></li>
<li><a href="#ViT">ViT</a></li>
<li><a href="#推荐系统模型">推荐系统模型</a></li>
</ul>
</li>
......@@ -51,12 +58,14 @@
<ul>
<li><a href="#GPT-3-Inference">GPT-3</a></li>
<li><a href="#OPT-Serving">1750亿参数OPT在线推理服务</a></li>
<li><a href="#BLOOM-Inference">1750亿参数 BLOOM</a></li>
</ul>
</li>
<li>
<a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a>
<ul>
<li><a href="#xTrimoMultimer">xTrimoMultimer: 蛋白质单体与复合物结构预测</a></li>
<li><a href="#AIGC">AIGC: 加速 Stable Diffusion</a></li>
<li><a href="#生物医药">生物医药: 加速AlphaFold蛋白质结构预测</a></li>
</ul>
</li>
<li>
......@@ -69,11 +78,6 @@
<li><a href="#使用-Docker">使用 Docker</a></li>
<li><a href="#社区">社区</a></li>
<li><a href="#做出贡献">做出贡献</a></li>
<li><a href="#快速预览">快速预览</a></li>
<ul>
<li><a href="#几行代码开启分布式训练">几行代码开启分布式训练</a></li>
<li><a href="#构建一个简单的2维并行模型">构建一个简单的2维并行模型</a></li>
</ul>
<li><a href="#引用我们">引用我们</a></li>
</ul>
......@@ -98,6 +102,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
- 1维, [2维](https://arxiv.org/abs/2104.05343), [2.5维](https://arxiv.org/abs/2105.14500), [3维](https://arxiv.org/abs/2105.14450) 张量并行
- [序列并行](https://arxiv.org/abs/2105.13120)
- [零冗余优化器 (ZeRO)](https://arxiv.org/abs/1910.02054)
- [自动并行](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
- 异构内存管理
- [PatrickStar](https://arxiv.org/abs/2108.05818)
- 使用友好
......@@ -105,16 +110,11 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
- 推理
- [Energon-AI](https://github.com/hpcaitech/EnergonAI)
- Colossal-AI 成功案例
- [xTrimoMultimer: 蛋白质单体与复合物结构预测](https://github.com/biomap-research/xTrimoMultimer)
- 生物医药: [FastFold](https://github.com/hpcaitech/FastFold) 加速蛋白质结构预测 AlphaFold 训练与推理
<p align="right">(<a href="#top">返回顶端</a>)</p>
## 并行训练样例展示
### ViT
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
</p>
- 14倍批大小和5倍训练速度(张量并行=64)
### GPT-3
<p align="center">
......@@ -149,6 +149,12 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI-Examples) 以了解详情。
### ViT
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
</p>
- 14倍批大小和5倍训练速度(张量并行=64)
### 推荐系统模型
- [Cached Embedding](https://github.com/hpcaitech/CachedEmbedding), 使用软件Cache实现Embeddings,用更少GPU显存训练更大的模型。
......@@ -178,7 +184,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
- 用相同的硬件训练34倍大的模型
<p align="right">(<a href="#top">back to top</a>)</p>
<p align="right">(<a href="#top">返回顶端</a>)</p>
## 推理 (Energon-AI) 样例展示
......@@ -195,23 +201,82 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
- [OPT推理服务](https://service.colossalai.org/opt): 无需注册,免费体验1750亿参数OPT在线推理服务
<p id="BLOOM-Inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
</p>
- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): 降低1750亿参数BLOOM模型部署推理成本超10倍
<p align="right">(<a href="#top">back to top</a>)</p>
<p align="right">(<a href="#top">返回顶端</a>)</p>
## Colossal-AI 成功案例
### xTrimoMultimer: 蛋白质单体与复合物结构预测
### AIGC
加速AIGC(AI内容生成)模型,如 [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) 和 [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion)
<p id="diffusion_train" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
</p>
- [训练](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): 减少5.6倍显存消耗,硬件成本最高降低46倍(从A100到RTX3060)
<p id="diffusion_demo" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
</p>
- [DreamBooth微调](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): 仅需3-5张目标主题图像个性化微调
<p id="inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>
- [推理](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): GPU推理显存消耗降低2.5倍
<p align="right">(<a href="#top">返回顶端</a>)</p>
### 生物医药
加速 [AlphaFold](https://alphafold.ebi.ac.uk/) 蛋白质结构预测
<p id="FastFold" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/FastFold.jpg" width=800/>
</p>
- [FastFold](https://github.com/hpcaitech/FastFold): 加速AlphaFold训练与推理、数据前处理、推理序列长度超过10000残基
<p id="xTrimoMultimer" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTM_Prediction.jpg" width=380/>
<p></p>
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
</p>
- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): 11倍加速蛋白质单体与复合物结构预测
<p align="right">(<a href="#top">返回顶端</a>)</p>
## 安装
### 从PyPI安装
您可以用下面的命令直接从PyPI上下载并安装Colossal-AI。我们默认不会安装PyTorch扩展包
```bash
pip install colossalai
```
但是,如果你想在安装时就直接构建PyTorch扩展,您可以设置环境变量`CUDA_EXT=1`.
```bash
CUDA_EXT=1 pip install colossalai
```
**否则,PyTorch扩展只会在你实际需要使用他们时在运行时里被构建。**
与此同时,我们也每周定时发布Nightly版本,这能让你提前体验到新的feature和bug fix。你可以通过以下命令安装Nightly版本。
```bash
pip install colossalai-nightly
```
### 从官方安装
您可以访问我们[下载](https://www.colossalai.org/download)页面来安装Colossal-AI,在这个页面上发布的版本都预编译了CUDA扩展。
......@@ -231,10 +296,10 @@ pip install -r requirements/requirements.txt
pip install .
```
如果您不想安装和启用 CUDA 内核融合(使用融合优化器时强制安装):
我们默认在`pip install`时不安装PyTorch扩展,而是在运行时临时编译,如果你想要提前安装这些扩展的话(在使用融合优化器时会用到),可以使用以下命令。
```shell
NO_CUDA_EXT=1 pip install .
CUDA_EXT=1 pip install .
```
<p align="right">(<a href="#top">返回顶端</a>)</p>
......@@ -283,31 +348,6 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash
<p align="right">(<a href="#top">返回顶端</a>)</p>
## 快速预览
### 几行代码开启分布式训练
```python
parallel = dict(
pipeline=2,
tensor=dict(mode='2.5d', depth = 1, size=4)
)
```
### 几行代码开启异构训练
```python
zero = dict(
model_config=dict(
tensor_placement_policy='auto',
shard_strategy=TensorShardStrategy(),
reuse_fp16_shard=True
),
optimizer_config=dict(initial_scale=2**5, gpu_margin_mem_ratio=0.2)
)
```
<p align="right">(<a href="#top">返回顶端</a>)</p>
## 引用我们
......
# Colossal-AI
<div id="top" align="center">
[![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Colossal-AI_logo.png)](https://www.colossalai.org/)
[![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)
Colossal-AI: A Unified Deep Learning System for Big Model Era
......@@ -23,6 +23,13 @@
</div>
## Latest News
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
## Table of Contents
<ul>
<li><a href="#Why-Colossal-AI">Why Colossal-AI</a> </li>
......@@ -30,12 +37,12 @@
<li>
<a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul>
<li><a href="#ViT">ViT</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
<li><a href="#PaLM">PaLM</a></li>
<li><a href="#OPT">OPT</a></li>
<li><a href="#ViT">ViT</a></li>
<li><a href="#Recommendation-System-Models">Recommendation System Models</a></li>
</ul>
</li>
......@@ -51,12 +58,14 @@
<ul>
<li><a href="#GPT-3-Inference">GPT-3</a></li>
<li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
<li><a href="#BLOOM-Inference">175B BLOOM</a></li>
</ul>
</li>
<li>
<a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
<ul>
<li><a href="#xTrimoMultimer">xTrimoMultimer: Accelerating Protein Monomer and Multimer Structure Prediction</a></li>
<li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
<li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
</ul>
</li>
<li>
......@@ -69,11 +78,6 @@
<li><a href="#Use-Docker">Use Docker</a></li>
<li><a href="#Community">Community</a></li>
<li><a href="#contributing">Contributing</a></li>
<li><a href="#Quick-View">Quick View</a></li>
<ul>
<li><a href="#Start-Distributed-Training-in-Lines">Start Distributed Training in Lines</a></li>
<li><a href="#Write-a-Simple-2D-Parallel-Model">Write a Simple 2D Parallel Model</a></li>
</ul>
<li><a href="#Cite-Us">Cite Us</a></li>
</ul>
......@@ -100,6 +104,7 @@ distributed training and inference in a few lines.
- 1D, [2D](https://arxiv.org/abs/2104.05343), [2.5D](https://arxiv.org/abs/2105.14500), [3D](https://arxiv.org/abs/2105.14450) Tensor Parallelism
- [Sequence Parallelism](https://arxiv.org/abs/2105.13120)
- [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054)
- [Auto-Parallelism](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
- Heterogeneous Memory Management
- [PatrickStar](https://arxiv.org/abs/2108.05818)
......@@ -111,16 +116,10 @@ distributed training and inference in a few lines.
- [Energon-AI](https://github.com/hpcaitech/EnergonAI)
- Colossal-AI in the Real World
- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): Accelerating Protein Monomer and Multimer Structure Prediction
- Biomedicine: [FastFold](https://github.com/hpcaitech/FastFold) accelerates training and inference of AlphaFold protein structure
<p align="right">(<a href="#top">back to top</a>)</p>
## Parallel Training Demo
### ViT
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
</p>
- 14x larger batch size, and 5x faster training for Tensor Parallelism = 64
### GPT-3
<p align="center">
......@@ -154,6 +153,13 @@ distributed training and inference in a few lines.
Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details.
### ViT
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
</p>
- 14x larger batch size, and 5x faster training for Tensor Parallelism = 64
### Recommendation System Models
- [Cached Embedding](https://github.com/hpcaitech/CachedEmbedding), utilize software cache to train larger embedding tables with a smaller GPU memory budget.
......@@ -198,26 +204,85 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
- [OPT Serving](https://service.colossalai.org/opt): Try 175-billion-parameter OPT online services for free, without any registration whatsoever.
<p id="BLOOM-Inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
</p>
- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): Reduce hardware deployment costs of 175-billion-parameter BLOOM by more than 10 times.
<p align="right">(<a href="#top">back to top</a>)</p>
## Colossal-AI in the Real World
### xTrimoMultimer: Accelerating Protein Monomer and Multimer Structure Prediction
### AIGC
Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).
<p id="diffusion_train" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
</p>
- [Training](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce Stable Diffusion memory consumption by up to 5.6x and hardware cost by up to 46x (from A100 to RTX3060).
<p id="diffusion_demo" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
</p>
- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.
<p id="inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>
- [Inference](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce inference GPU memory consumption by 2.5x.
<p align="right">(<a href="#top">back to top</a>)</p>
### Biomedicine
Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
<p id="FastFold" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/FastFold.jpg" width=800/>
</p>
- [FastFold](https://github.com/hpcaitech/FastFold): accelerating training and inference on GPU Clusters, faster data processing, inference sequence containing more than 10000 residues.
<p id="xTrimoMultimer" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTM_Prediction.jpg" width=380/>
<p></p>
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
</p>
- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): accelerating structure prediction of protein monomers and multimer by 11x
- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): accelerating structure prediction of protein monomers and multimer by 11x.
<p align="right">(<a href="#top">back to top</a>)</p>
## Installation
### Install from PyPI
You can easily install Colossal-AI with the following command. **By default, we do not build PyTorch extensions during installation.**
```bash
pip install colossalai
```
However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
```bash
CUDA_EXT=1 pip install colossalai
```
**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
We also release a nightly version to PyPI on a weekly basis. This allows you to access the unreleased features and bug fixes in the main branch.
Installation can be made via
```bash
pip install colossalai-nightly
```
### Download From Official Releases
You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built CUDA extensions.
You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built PyTorch extensions.
### Download From Source
......@@ -228,17 +293,15 @@ You can visit the [Download](https://www.colossalai.org/download) page to downlo
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
# install colossalai
pip install .
```
If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
By default, we do not compile CUDA/C++ kernels. ColossalAI will build them during runtime.
If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
```shell
NO_CUDA_EXT=1 pip install .
CUDA_EXT=1 pip install .
```
<p align="right">(<a href="#top">back to top</a>)</p>
......@@ -289,32 +352,6 @@ Thanks so much to all of our amazing contributors!
<p align="right">(<a href="#top">back to top</a>)</p>
## Quick View
### Start Distributed Training in Lines
```python
parallel = dict(
pipeline=2,
tensor=dict(mode='2.5d', depth = 1, size=4)
)
```
### Start Heterogeneous Training in Lines
```python
zero = dict(
model_config=dict(
tensor_placement_policy='auto',
shard_strategy=TensorShardStrategy(),
reuse_fp16_shard=True
),
optimizer_config=dict(initial_scale=2**5, gpu_margin_mem_ratio=0.2)
)
```
<p align="right">(<a href="#top">back to top</a>)</p>
## Cite Us
......
......@@ -7,4 +7,11 @@ from .initialize import (
launch_from_torch,
)
__version__ = '0.1.11rc1'
try:
# .version will be created by setup.py
from .version import __version__
except ModuleNotFoundError:
# this will only happen if the user did not run `pip install`
# and directly set PYTHONPATH to use Colossal-AI which is a bad practice
__version__ = '0.0.0'
print('please install Colossal-AI from https://www.colossalai.org/download or from source')
from .apex_amp import ApexAMPOptimizer
import torch.nn as nn
from torch.optim import Optimizer
from .apex_amp import ApexAMPOptimizer
def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
r"""A helper function to wrap training components with Apex AMP modules
......
import inspect
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.utils import is_no_pp_or_last_stage
from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
from .grad_scaler import DynamicGradScaler, ConstantGradScaler
from ._fp16_optimizer import FP16Optimizer
from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer
def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
......
......@@ -3,24 +3,33 @@
import torch
import torch.distributed as dist
try:
import colossal_C
except:
print('Colossalai should be built with cuda extension to use the FP16 optimizer')
from torch.distributed import ProcessGroup
from torch.optim import Optimizer
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.logging import get_dist_logger
from colossalai.utils import (copy_tensor_parallel_attributes, clip_grad_norm_fp32, multi_tensor_applier)
from torch.distributed import ProcessGroup
from .grad_scaler import BaseGradScaler
from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
from ._utils import has_inf_or_nan, zero_gard_by_list
from .grad_scaler import BaseGradScaler
try:
    from colossalai._C import fused_optim
except ImportError:
    # The pre-built extension could not be imported. Leave a None placeholder
    # so that load_fused_optim() can build and load it when it is first needed.
    # Only ImportError is caught so that KeyboardInterrupt/SystemExit and
    # unrelated failures are not silently swallowed.
    fused_optim = None
__all__ = ['FP16Optimizer']
def load_fused_optim():
    """Make sure the module-level ``fused_optim`` is populated.

    Does nothing when ``fused_optim`` is already set. Otherwise the result of
    ``FusedOptimBuilder().load()`` is stored in the module-level name.
    """
    global fused_optim
    if fused_optim is not None:
        # Already imported or built earlier; nothing to do.
        return
    fused_optim = FusedOptimBuilder().load()
def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
"""
adapted from Megatron-LM (https://github.com/NVIDIA/Megatron-LM)
......@@ -33,7 +42,9 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
if overflow_buf:
overflow_buf.fill_(0)
# Scaling with factor `1.0` is equivalent to copy.
multi_tensor_applier(colossal_C.multi_tensor_scale, overflow_buf, [this, that], 1.0)
global fused_optim
load_fused_optim()
multi_tensor_applier(fused_optim.multi_tensor_scale, overflow_buf, [this, that], 1.0)
else:
for this_, that_ in zip(this, that):
that_.copy_(this_)
......@@ -73,8 +84,8 @@ class FP16Optimizer(Optimizer):
# get process group
def _get_process_group(parallel_mode):
if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA):
return gpc.get_group(ParallelMode.DATA)
if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode):
return gpc.get_group(parallel_mode)
else:
return None
......@@ -150,6 +161,12 @@ class FP16Optimizer(Optimizer):
f"==========================================",
ranks=[0])
@property
def max_norm(self):
    """Maximum gradient norm used for clipping.

    Returns:
        The value stored in ``self._clip_grad_max_norm``.
    """
    clip_threshold = self._clip_grad_max_norm
    return clip_threshold
@property
def grad_scaler(self):
"""Returns the gradient scaler.
......
from typing import List
from torch import Tensor
......
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
from abc import ABC, abstractmethod
from colossalai.logging import get_dist_logger
from torch import Tensor
from typing import Dict
import torch
from torch import Tensor
from colossalai.logging import get_dist_logger
__all__ = ['BaseGradScaler']
......
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional
import torch
from .base_grad_scaler import BaseGradScaler
from typing import Optional
__all__ = ['DynamicGradScaler']
......
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Any
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.nn as nn
from torch import Tensor
from typing import Any
from torch.optim import Optimizer
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.distributed import ReduceOp
from colossalai.core import global_context as gpc
from torch.optim import Optimizer
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.optimizer import ColossalaiOptimizer
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from ._fp16_optimizer import FP16Optimizer
......@@ -40,7 +43,11 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
return self.optim.step()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
    """Validate a request to clip gradients; no clipping is performed here.

    Args:
        model: Unused by this method.
        max_norm: Requested maximum gradient norm.

    Raises:
        RuntimeError: If ``max_norm`` is not equal to ``self.optim.max_norm``.
    """
    same_norm_as_optimizer = self.optim.max_norm == max_norm
    if not same_norm_as_optimizer:
        raise RuntimeError("NaiveAMP optimizer has clipped gradients during optimizer.step(). "
                           "If you have supplied clip_grad_norm in the amp_config, "
                           "executing the method clip_grad_norm is not allowed.")
class NaiveAMPModel(nn.Module):
......
from typing import Optional
import torch.nn as nn
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.context import Config
from .torch_amp import TorchAMPOptimizer, TorchAMPModel, TorchAMPLoss
from typing import Optional
from .torch_amp import TorchAMPLoss, TorchAMPModel, TorchAMPOptimizer
def convert_to_torch_amp(model: nn.Module,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment