Merge branch 'main' of https://github.com/oahzxl/ColossalAI into chunk

e532679c · oahzxl · c1492e50 · 7d5640b9 · e532679c · e532679c
Commit e532679c authored Jan 10, 2023 by oahzxl
20 changed files
--- a/.github/workflows/scripts/example_checks/check_example_weekly.py
+++ b/.github/workflows/scripts/example_checks/check_example_weekly.py
+import os
+
+
+def show_files(path, all_files):
+    """Recursively collect the paths of all non-directory entries under ``path``.
+
+    Args:
+        path: directory to walk; passed directly to ``os.listdir``.
+        all_files: list that found file paths are appended to (mutated in place).
+
+    Returns:
+        The same ``all_files`` list, after every file found below ``path`` has been appended.
+    """
+    # List every entry (file or folder) directly inside ``path``.
+    file_list = os.listdir(path)
+    # Files are appended to the result list; folders are walked recursively.
+    for file_name in file_list:
+        # Prefix the entry name with ``path``; the result is only absolute if ``path`` is.
+        cur_path = os.path.join(path, file_name)
+        # Recurse into sub-folders, record everything else.
+        if os.path.isdir(cur_path):
+            show_files(cur_path, all_files)
+        else:
+            all_files.append(cur_path)
+    return all_files
+
+
+def join(input_list, sep=None):
+    """Join the strings in ``input_list`` using ``sep``.
+
+    A falsy ``sep`` (``None`` or the empty string) falls back to a single space.
+    """
+    return (sep or ' ').join(input_list)
+
+
+def main():
+    """Print the list of ``<area>/<application>`` example folders found under ``examples/``.
+
+    Walks the relative ``examples/`` directory, keeps the second and third path
+    components of every sufficiently deep file, de-duplicates them in first-seen
+    order and prints the resulting list.
+    """
+    contents = show_files('examples/', [])
+    all_loc = []
+    for file_loc in contents:
+        # NOTE(review): paths are split on '/' only, so this assumes the OS path separator is '/'.
+        split_loc = file_loc.split('/')
+        # A file must sit at least two sub-folder levels below ``examples``:
+        # examples/images/vit/<file> is accepted, while examples/images/README.md
+        # and examples/requirements.txt are not.
+        if len(split_loc) >= 4:
+            # Keep only the "<area>/<application>" part, e.g. "images/vit".
+            re_loc = '/'.join(split_loc[1:3])
+            if re_loc not in all_loc:
+                all_loc.append(re_loc)
+    print(all_loc)
+
+
+if __name__ == '__main__':
+    main()
--- a/.github/workflows/scripts/example_checks/detect_changed_example.py
+++ b/.github/workflows/scripts/example_checks/detect_changed_example.py
+import argparse
+
+
+def main():
+    """Print the example folders touched by a colon-separated list of changed files.
+
+    Reads ``-f/--fileNameList``, splits it on ``:``, and for every path that lies
+    under ``examples/`` with at least four path components collects its
+    ``<area>/<application>`` part. The collected folders are printed as a list.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
+    args = parser.parse_args()
+    # NOTE(review): the option is neither required nor given a default, so when it is
+    # omitted ``args.fileNameList`` is None and the ``.split`` below raises AttributeError.
+    name_list = args.fileNameList.split(":")
+    folder_need_check = set()
+    for loc in name_list:
+        # Keep only the sub-sub-folder of the 'examples' folder.
+        # The examples folder structure looks like
+        # - examples
+        #   - area
+        #     - application
+        #       - file
+        if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
+            folder_need_check.add('/'.join(loc.split("/")[1:3]))
+    # Output the result using print so the calling shell can capture it.
+    # The list is built from a set, so the order of its elements is not specified.
+    print(list(folder_need_check))
+
+
+if __name__ == '__main__':
+    main()
--- a/.github/workflows/scripts/generate_release_draft.py
+++ b/.github/workflows/scripts/generate_release_draft.py
@@ -2,9 +2,10 @@
 # coding: utf-8

 import argparse
-import requests
-import re
 import os
+import re
+
+import requests

 COMMIT_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/commits'
 TAGS_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/tags'

--- a/.github/workflows/submodule.yml
+++ b/.github/workflows/submodule.yml
@@ -43,4 +43,3 @@ jobs:
          assignees: ${{ github.actor }}
          delete-branch: true
          branch: create-pull-request/patch-sync-submodule
-          
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
@@ -134,10 +134,23 @@ dmypy.json
 .vscode/

 # macos
-.DS_Store
+*.DS_Store
 #data/

 docs/.build

 # pytorch checkpoint
 *.pt
+
+# ignore version.py generated by setup.py
+colossalai/version.py
+
+# ignore any kernel build files
+.o
+.so
+
+# ignore python interface definition file
+.pyi
+
+# ignore coverage test file
+converage.lcov
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
--- a/LICENSE
+++ b/LICENSE
-Copyright 2021- The Colossal-ai Authors. All rights reserved.
+Copyright 2021- HPC-AI Technology Inc. All rights reserved.
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
@@ -187,7 +187,7 @@ Copyright 2021- The Colossal-ai Authors. All rights reserved.
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2021- HPC-AI Technology Inc.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/MANIFEST.in
+++ b/MANIFEST.in
 include *.txt README.md
 recursive-include requirements *.txt
-recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc
\ No newline at end of file
+recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
+recursive-include op_builder *.py
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
 # Colossal-AI
 <div id="top" align="center">

-   [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Colossal-AI_logo.png)](https://www.colossalai.org/)
+   [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)

   Colossal-AI: 一个面向大模型时代的通用深度学习系统

@@ -22,6 +22,13 @@

 </div>

+## 新闻
+* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
+* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
+* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
+* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
+* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
+

 ## 目录
 <ul>
@@ -30,12 +37,12 @@
 <li>
   <a href="#并行训练样例展示">并行训练样例展示</a>
   <ul>
-     <li><a href="#ViT">ViT</a></li>
     <li><a href="#GPT-3">GPT-3</a></li>
     <li><a href="#GPT-2">GPT-2</a></li>
     <li><a href="#BERT">BERT</a></li>
     <li><a href="#PaLM">PaLM</a></li>
     <li><a href="#OPT">OPT</a></li>
+     <li><a href="#ViT">ViT</a></li>
     <li><a href="#推荐系统模型">推荐系统模型</a></li>
   </ul>
 </li>
@@ -51,12 +58,14 @@
   <ul>
     <li><a href="#GPT-3-Inference">GPT-3</a></li>
     <li><a href="#OPT-Serving">1750亿参数OPT在线推理服务</a></li>
+     <li><a href="#BLOOM-Inference">1750亿参数 BLOOM</a></li>
   </ul>
 </li>
 <li>
   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a>
   <ul>
-     <li><a href="#xTrimoMultimer">xTrimoMultimer: 蛋白质单体与复合物结构预测</a></li>
+     <li><a href="#AIGC">AIGC: 加速 Stable Diffusion</a></li>
+     <li><a href="#生物医药">生物医药: 加速AlphaFold蛋白质结构预测</a></li>
   </ul>
 </li>
 <li>
@@ -69,11 +78,6 @@
 <li><a href="#使用-Docker">使用 Docker</a></li>
 <li><a href="#社区">社区</a></li>
 <li><a href="#做出贡献">做出贡献</a></li>
- <li><a href="#快速预览">快速预览</a></li>
-   <ul>
-     <li><a href="#几行代码开启分布式训练">几行代码开启分布式训练</a></li>
-     <li><a href="#构建一个简单的2维并行模型">构建一个简单的2维并行模型</a></li>
-   </ul>
 <li><a href="#引用我们">引用我们</a></li>
 </ul>

@@ -98,6 +102,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
  - 1维, [2维](https://arxiv.org/abs/2104.05343), [2.5维](https://arxiv.org/abs/2105.14500), [3维](https://arxiv.org/abs/2105.14450) 张量并行
  - [序列并行](https://arxiv.org/abs/2105.13120)
  - [零冗余优化器 (ZeRO)](https://arxiv.org/abs/1910.02054)
+  - [自动并行](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)
 - 异构内存管理
  - [PatrickStar](https://arxiv.org/abs/2108.05818)
 - 使用友好
@@ -105,16 +110,11 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 - 推理
  - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
 - Colossal-AI 成功案例
-  - [xTrimoMultimer: 蛋白质单体与复合物结构预测](https://github.com/biomap-research/xTrimoMultimer)
+  - 生物医药: [FastFold](https://github.com/hpcaitech/FastFold) 加速蛋白质结构预测 AlphaFold 训练与推理
 <p align="right">(<a href="#top">返回顶端</a>)</p>

 ## 并行训练样例展示
-### ViT
-<p align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
-</p>

- 14倍批大小和5倍训练速度（张量并行=64）

 ### GPT-3
 <p align="center">
@@ -149,6 +149,12 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

 请访问我们的 [文档](https://www.colossalai.org/) 和 [例程](https://github.com/hpcaitech/ColossalAI-Examples) 以了解详情。

+### ViT
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
+</p>
+
+- 14倍批大小和5倍训练速度（张量并行=64）

 ### 推荐系统模型
 - [Cached Embedding](https://github.com/hpcaitech/CachedEmbedding), 使用软件Cache实现Embeddings，用更少GPU显存训练更大的模型。
@@ -178,7 +184,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

 - 用相同的硬件训练34倍大的模型

-<p align="right">(<a href="#top">back to top</a>)</p>
+<p align="right">(<a href="#top">返回顶端</a>)</p>


 ## 推理 (Energon-AI) 样例展示
@@ -195,23 +201,82 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

 - [OPT推理服务](https://service.colossalai.org/opt): 无需注册，免费体验1750亿参数OPT在线推理服务

+<p id="BLOOM-Inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
+</p>
+
+- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): 降低1750亿参数BLOOM模型部署推理成本超10倍

-<p align="right">(<a href="#top">back to top</a>)</p>
+<p align="right">(<a href="#top">返回顶端</a>)</p>

 ## Colossal-AI 成功案例

-### xTrimoMultimer: 蛋白质单体与复合物结构预测
+### AIGC
+加速AIGC(AI内容生成)模型，如[Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) 和 [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion)
+
+<p id="diffusion_train" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
+</p>
+
+- [训练](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): 减少5.6倍显存消耗，硬件成本最高降低46倍(从A100到RTX3060)
+
+<p id="diffusion_demo" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
+</p>
+
+- [DreamBooth微调](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): 仅需3-5张目标主题图像个性化微调
+
+<p id="inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
+</p>
+
+- [推理](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): GPU推理显存消耗降低2.5倍
+
+
+<p align="right">(<a href="#top">返回顶端</a>)</p>
+
+### 生物医药
+
+加速 [AlphaFold](https://alphafold.ebi.ac.uk/) 蛋白质结构预测
+
+<p id="FastFold" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/FastFold.jpg" width=800/>
+</p>
+
+- [FastFold](https://github.com/hpcaitech/FastFold): 加速AlphaFold训练与推理、数据前处理、推理序列长度超过10000残基
+
 <p id="xTrimoMultimer" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTM_Prediction.jpg" width=380/>
-<p></p>
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
 </p>

 - [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): 11倍加速蛋白质单体与复合物结构预测

+<p align="right">(<a href="#top">返回顶端</a>)</p>

 ## 安装

+### 从PyPI安装
+
+您可以用下面的命令直接从PyPI上下载并安装Colossal-AI。我们默认不会安装PyTorch扩展包
+
+```bash
+pip install colossalai
+```
+
+但是，如果你想在安装时就直接构建PyTorch扩展，您可以设置环境变量`CUDA_EXT=1`.
+
+```bash
+CUDA_EXT=1 pip install colossalai
+```
+
+**否则，PyTorch扩展只会在你实际需要使用它们时在运行时里被构建。**
+
+与此同时，我们也每周定时发布Nightly版本，这能让你提前体验到新的feature和bug fix。你可以通过以下命令安装Nightly版本。
+
+```bash
+pip install colossalai-nightly
+```
+
 ### 从官方安装

 您可以访问我们[下载](https://www.colossalai.org/download)页面来安装Colossal-AI，在这个页面上发布的版本都预编译了CUDA扩展。
@@ -231,10 +296,10 @@ pip install -r requirements/requirements.txt
 pip install .
 ```

-如果您不想安装和启用 CUDA 内核融合（使用融合优化器时强制安装）：
+我们默认在`pip install`时不安装PyTorch扩展，而是在运行时临时编译，如果你想要提前安装这些扩展的话（在使用融合优化器时会用到），可以使用以下命令。

 ```shell
-NO_CUDA_EXT=1 pip install .
+CUDA_EXT=1 pip install .
 ```

 <p align="right">(<a href="#top">返回顶端</a>)</p>
@@ -283,31 +348,6 @@ docker run -ti --gpus all --rm --ipc=host colossalai bash

 <p align="right">(<a href="#top">返回顶端</a>)</p>

-## 快速预览
-
-### 几行代码开启分布式训练
-
-```python
-parallel = dict(
-    pipeline=2,
-    tensor=dict(mode='2.5d', depth = 1, size=4)
-)
-```
-
-### 几行代码开启异构训练
-
-```python
-zero = dict(
-    model_config=dict(
-        tensor_placement_policy='auto',
-        shard_strategy=TensorShardStrategy(),
-        reuse_fp16_shard=True
-    ),
-    optimizer_config=dict(initial_scale=2**5, gpu_margin_mem_ratio=0.2)
-)
-```
-
-<p align="right">(<a href="#top">返回顶端</a>)</p>

 ## 引用我们


--- a/README.md
+++ b/README.md
 # Colossal-AI
 <div id="top" align="center">

-   [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Colossal-AI_logo.png)](https://www.colossalai.org/)
+   [![logo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/colossal-ai_logo_vertical.png)](https://www.colossalai.org/)

   Colossal-AI: A Unified Deep Learning System for Big Model Era

@@ -23,6 +23,13 @@

 </div>

+## Latest News
+* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0)
+* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
+* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
+* [2022/10] [Embedding Training With 1% GPU Memory and 100 Times Less Budget for Super-Large Recommendation Model](https://www.hpc-ai.tech/blog/embedding-training-with-1-gpu-memory-and-10-times-less-budget-an-open-source-solution-for)
+* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)
+
 ## Table of Contents
 <ul>
 <li><a href="#Why-Colossal-AI">Why Colossal-AI</a> </li>
@@ -30,12 +37,12 @@
 <li>
   <a href="#Parallel-Training-Demo">Parallel Training Demo</a>
   <ul>
-     <li><a href="#ViT">ViT</a></li>
     <li><a href="#GPT-3">GPT-3</a></li>
     <li><a href="#GPT-2">GPT-2</a></li>
     <li><a href="#BERT">BERT</a></li>
     <li><a href="#PaLM">PaLM</a></li>
     <li><a href="#OPT">OPT</a></li>
+     <li><a href="#ViT">ViT</a></li>
     <li><a href="#Recommendation-System-Models">Recommendation System Models</a></li>
   </ul>
 </li>
@@ -51,12 +58,14 @@
   <ul>
     <li><a href="#GPT-3-Inference">GPT-3</a></li>
     <li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
+     <li><a href="#BLOOM-Inference">175B BLOOM</a></li>
   </ul>
 </li>
   <li>
   <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
   <ul>
-     <li><a href="#xTrimoMultimer">xTrimoMultimer: Accelerating Protein Monomer and Multimer Structure Prediction</a></li>
+     <li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
+     <li><a href="#Biomedicine">Biomedicine: Acceleration of AlphaFold Protein Structure</a></li>
   </ul>
 </li>
 <li>
@@ -69,11 +78,6 @@
 <li><a href="#Use-Docker">Use Docker</a></li>
 <li><a href="#Community">Community</a></li>
 <li><a href="#contributing">Contributing</a></li>
- <li><a href="#Quick-View">Quick View</a></li>
-   <ul>
-     <li><a href="#Start-Distributed-Training-in-Lines">Start Distributed Training in Lines</a></li>
-     <li><a href="#Write-a-Simple-2D-Parallel-Model">Write a Simple 2D Parallel Model</a></li>
-   </ul>
 <li><a href="#Cite-Us">Cite Us</a></li>
 </ul>

@@ -100,6 +104,7 @@ distributed training and inference in a few lines.
  - 1D, [2D](https://arxiv.org/abs/2104.05343), [2.5D](https://arxiv.org/abs/2105.14500), [3D](https://arxiv.org/abs/2105.14450) Tensor Parallelism
  - [Sequence Parallelism](https://arxiv.org/abs/2105.13120)
  - [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054)
+  - [Auto-Parallelism](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt/auto_parallel_with_gpt)

 - Heterogeneous Memory Management
  - [PatrickStar](https://arxiv.org/abs/2108.05818)
@@ -111,16 +116,10 @@ distributed training and inference in a few lines.
  - [Energon-AI](https://github.com/hpcaitech/EnergonAI)

 - Colossal-AI in the Real World
-  - [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): Accelerating Protein Monomer and Multimer Structure Prediction
+  - Biomedicine: [FastFold](https://github.com/hpcaitech/FastFold) accelerates training and inference of AlphaFold protein structure
 <p align="right">(<a href="#top">back to top</a>)</p>

 ## Parallel Training Demo
-### ViT
-<p align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
-</p>
-
- 14x larger batch size, and 5x faster training for Tensor Parallelism = 64

 ### GPT-3
 <p align="center">
@@ -154,6 +153,13 @@ distributed training and inference in a few lines.

 Please visit our [documentation](https://www.colossalai.org/) and [examples](https://github.com/hpcaitech/ColossalAI-Examples) for more details.

+### ViT
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/ViT.png" width="450" />
+</p>
+
+- 14x larger batch size, and 5x faster training for Tensor Parallelism = 64
+
 ### Recommendation System Models
 - [Cached Embedding](https://github.com/hpcaitech/CachedEmbedding), utilize software cache to train larger embedding tables with a smaller GPU memory budget.

@@ -198,26 +204,85 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt

 - [OPT Serving](https://service.colossalai.org/opt): Try 175-billion-parameter OPT online services for free, without any registration whatsoever.

+<p id="BLOOM-Inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
+</p>
+
+- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): Reduce hardware deployment costs of 175-billion-parameter BLOOM by more than 10 times.
+
 <p align="right">(<a href="#top">back to top</a>)</p>

 ## Colossal-AI in the Real World

-### xTrimoMultimer: Accelerating Protein Monomer and Multimer Structure Prediction
+### AIGC
+Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).
+<p id="diffusion_train" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
+</p>
+
+- [Training](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce Stable Diffusion memory consumption by up to 5.6x and hardware cost by up to 46x (from A100 to RTX3060).
+
+<p id="diffusion_demo" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
+</p>
+
+- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.
+
+<p id="inference" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
+</p>
+
+- [Inference](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce inference GPU memory consumption by 2.5x.
+
+
+<p align="right">(<a href="#top">back to top</a>)</p>
+
+### Biomedicine
+Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
+
+<p id="FastFold" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/FastFold.jpg" width=800/>
+</p>
+
+- [FastFold](https://github.com/hpcaitech/FastFold): accelerating training and inference on GPU Clusters, faster data processing, inference sequence containing more than 10000 residues.
+
 <p id="xTrimoMultimer" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTM_Prediction.jpg" width=380/>
-<p></p>
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/xTrimoMultimer_Table.jpg" width=800/>
 </p>

- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): accelerating structure prediction of protein monomers and multimer by 11x
+- [xTrimoMultimer](https://github.com/biomap-research/xTrimoMultimer): accelerating structure prediction of protein monomers and multimers by 11x.
+

 <p align="right">(<a href="#top">back to top</a>)</p>

 ## Installation

+### Install from PyPI
+
+You can easily install Colossal-AI with the following command. **By default, we do not build PyTorch extensions during installation.**
+
+```bash
+pip install colossalai
+```
+
+However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
+
+```bash
+CUDA_EXT=1 pip install colossalai
+```
+
+**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
+
+We also release a nightly version to PyPI on a weekly basis. This allows you to access the unreleased features and bug fixes in the main branch.
+Installation can be made via
+
+```bash
+pip install colossalai-nightly
+```
+
 ### Download From Official Releases

-You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built CUDA extensions.
+You can visit the [Download](https://www.colossalai.org/download) page to download Colossal-AI with pre-built PyTorch extensions.


 ### Download From Source
@@ -228,17 +293,15 @@ You can visit the [Download](https://www.colossalai.org/download) page to downlo
 git clone https://github.com/hpcaitech/ColossalAI.git
 cd ColossalAI

-# install dependency
-pip install -r requirements/requirements.txt
-
 # install colossalai
 pip install .
 ```

-If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
+By default, we do not compile CUDA/C++ kernels. ColossalAI will build them during runtime.
+If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):

 ```shell
-NO_CUDA_EXT=1 pip install .
+CUDA_EXT=1 pip install .
 ```

 <p align="right">(<a href="#top">back to top</a>)</p>
@@ -289,32 +352,6 @@ Thanks so much to all of our amazing contributors!

 <p align="right">(<a href="#top">back to top</a>)</p>

-## Quick View
-
-### Start Distributed Training in Lines
-
-```python
-parallel = dict(
-    pipeline=2,
-    tensor=dict(mode='2.5d', depth = 1, size=4)
-)
-```
-
-### Start Heterogeneous Training in Lines
-
-```python
-zero = dict(
-    model_config=dict(
-        tensor_placement_policy='auto',
-        shard_strategy=TensorShardStrategy(),
-        reuse_fp16_shard=True
-    ),
-    optimizer_config=dict(initial_scale=2**5, gpu_margin_mem_ratio=0.2)
-)
-
-```
-
-<p align="right">(<a href="#top">back to top</a>)</p>

 ## Cite Us


--- a/colossalai/_C/__init__.py
+++ b/colossalai/_C/__init__.py
--- a/colossalai/__init__.py
+++ b/colossalai/__init__.py
@@ -7,4 +7,11 @@ from .initialize import (
    launch_from_torch,
 )

-__version__ = '0.1.11rc1'
+try:
+    # .version will be created by setup.py
+    from .version import __version__
+except ModuleNotFoundError:
+    # this will only happen if the user did not run `pip install`
+    # and directly set PYTHONPATH to use Colossal-AI which is a bad practice
+    __version__ = '0.0.0'
+    print('please install Colossal-AI from https://www.colossalai.org/download or from source')
--- a/colossalai/amp/apex_amp/__init__.py
+++ b/colossalai/amp/apex_amp/__init__.py
-from .apex_amp import ApexAMPOptimizer
 import torch.nn as nn
 from torch.optim import Optimizer

+from .apex_amp import ApexAMPOptimizer
+

 def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
    r"""A helper function to wrap training components with Apex AMP modules

--- a/colossalai/amp/naive_amp/__init__.py
+++ b/colossalai/amp/naive_amp/__init__.py
 import inspect
+
 import torch.nn as nn
 from torch.optim import Optimizer
+
 from colossalai.utils import is_no_pp_or_last_stage
-from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
-from .grad_scaler import DynamicGradScaler, ConstantGradScaler
+
 from ._fp16_optimizer import FP16Optimizer
+from .grad_scaler import ConstantGradScaler, DynamicGradScaler
+from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer


 def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):

--- a/colossalai/amp/naive_amp/_fp16_optimizer.py
+++ b/colossalai/amp/naive_amp/_fp16_optimizer.py
@@ -3,24 +3,33 @@

 import torch
 import torch.distributed as dist
-
-try:
-    import colossal_C
-except:
-    print('Colossalai should be built with cuda extension to use the FP16 optimizer')
-
+from torch.distributed import ProcessGroup
 from torch.optim import Optimizer
-from colossalai.core import global_context as gpc
+
 from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.kernel.op_builder import FusedOptimBuilder
 from colossalai.logging import get_dist_logger
-from colossalai.utils import (copy_tensor_parallel_attributes, clip_grad_norm_fp32, multi_tensor_applier)
-from torch.distributed import ProcessGroup
-from .grad_scaler import BaseGradScaler
+from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
+
 from ._utils import has_inf_or_nan, zero_gard_by_list
+from .grad_scaler import BaseGradScaler
+
+try:
+    from colossalai._C import fused_optim
+except:
+    fused_optim = None

 __all__ = ['FP16Optimizer']


+def load_fused_optim():
+    """Ensure the module-level ``fused_optim`` is populated.
+
+    When ``fused_optim`` is still ``None`` it is replaced by the result of
+    ``FusedOptimBuilder().load()``; otherwise this is a no-op.
+    """
+    # Rebinds the module-level name, hence the ``global`` declaration.
+    global fused_optim
+
+    if fused_optim is None:
+        # presumably builds/loads the fused optimizer kernels at runtime -- confirm against FusedOptimBuilder
+        fused_optim = FusedOptimBuilder().load()
+
+
 def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
    """
    adapted from Megatron-LM (https://github.com/NVIDIA/Megatron-LM)
@@ -33,7 +42,9 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
    if overflow_buf:
        overflow_buf.fill_(0)
        # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(colossal_C.multi_tensor_scale, overflow_buf, [this, that], 1.0)
+        global fused_optim
+        load_fused_optim()
+        multi_tensor_applier(fused_optim.multi_tensor_scale, overflow_buf, [this, that], 1.0)
    else:
        for this_, that_ in zip(this, that):
            that_.copy_(this_)
@@ -73,8 +84,8 @@ class FP16Optimizer(Optimizer):

        # get process group
        def _get_process_group(parallel_mode):
-            if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA):
-                return gpc.get_group(ParallelMode.DATA)
+            if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode):
+                return gpc.get_group(parallel_mode)
            else:
                return None

@@ -150,6 +161,12 @@ class FP16Optimizer(Optimizer):
                f"==========================================",
                ranks=[0])

+    @property
+    def max_norm(self):
+        """Returns the maximum gradient norm used for clipping (``self._clip_grad_max_norm``).
+        """
+        return self._clip_grad_max_norm
+
    @property
    def grad_scaler(self):
        """Returns the gradient scaler.

--- a/colossalai/amp/naive_amp/_utils.py
+++ b/colossalai/amp/naive_amp/_utils.py
 from typing import List
+
 from torch import Tensor



--- a/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-import torch
 from abc import ABC, abstractmethod
-from colossalai.logging import get_dist_logger
-from torch import Tensor
 from typing import Dict

+import torch
+from torch import Tensor
+
+from colossalai.logging import get_dist_logger
+
 __all__ = ['BaseGradScaler']



--- a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

+from typing import Optional
+
 import torch
+
 from .base_grad_scaler import BaseGradScaler
-from typing import Optional

 __all__ = ['DynamicGradScaler']


--- a/colossalai/amp/naive_amp/naive_amp.py
+++ b/colossalai/amp/naive_amp/naive_amp.py
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

+from typing import Any
+
 import torch
-import torch.nn as nn
 import torch.distributed as dist
+import torch.nn as nn
 from torch import Tensor
-from typing import Any
-from torch.optim import Optimizer
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from torch.distributed import ReduceOp
-from colossalai.core import global_context as gpc
+from torch.optim import Optimizer
+
 from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.nn.optimizer import ColossalaiOptimizer
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
 from ._fp16_optimizer import FP16Optimizer


@@ -40,7 +43,11 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
        return self.optim.step()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
-        pass
+        if self.optim.max_norm == max_norm:
+            return
+        raise RuntimeError("NaiveAMP optimizer has clipped gradients during optimizer.step(). "
+                           "If you have supplied clip_grad_norm in the amp_config, "
+                           "executing the method clip_grad_norm is not allowed.")


 class NaiveAMPModel(nn.Module):

--- a/colossalai/amp/torch_amp/__init__.py
+++ b/colossalai/amp/torch_amp/__init__.py
+from typing import Optional
+
 import torch.nn as nn
-from torch.optim import Optimizer
 from torch.nn.modules.loss import _Loss
+from torch.optim import Optimizer
+
 from colossalai.context import Config
-from .torch_amp import TorchAMPOptimizer, TorchAMPModel, TorchAMPLoss
-from typing import Optional
+
+from .torch_amp import TorchAMPLoss, TorchAMPModel, TorchAMPOptimizer


 def convert_to_torch_amp(model: nn.Module,