Commit 08f2920e authored by zhuwenwen

init colossalai, support dtk2304

parent da3f0934
Pipeline #237 failed in 0 seconds
BasedOnStyle: Google
[flake8]
ignore =
;W503 line break before binary operator
W503,
;E203 whitespace before ':'
E203,
; exclude file
exclude =
.tox,
.git,
__pycache__,
build,
dist,
*.pyc,
*.egg-info,
.cache,
.eggs
max-line-length = 120
per-file-ignores = __init__.py:F401
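For context, both flake8 codes ignored above (W503 and E203) flag formatting that auto-formatters such as Black emit on purpose. A minimal sketch of code this config accepts (the function and variable names here are illustrative, not from the repo):

```python
def total_cost(price: float, tax: float, shipping: float) -> float:
    # W503: line break *before* a binary operator -- ignored by this config
    return (price
            + tax
            + shipping)

items = [1, 2, 3, 4, 5]
# E203: whitespace before ':' in a slice -- also ignored by this config
middle = items[1 : 4]

print(total_cost(10.0, 1.0, 2.0))
print(middle)
```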
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
title: "[BUG]: "
labels: [bug]
body:
- type: markdown
attributes:
value: >
#### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new).
- type: textarea
attributes:
label: 🐛 Describe the bug
description: |
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps or code snippet to reproduce the behavior.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Optional: Affiliation**
Sharing your institution/email helps us better understand our users and improve the project. We welcome in-depth cooperation.
placeholder: |
A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide the environment information, e.g. CUDA/cuDNN/NCCL/Python/PyTorch versions.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
blank_issues_enabled: true
contact_links:
- name: ❓ Simple question - Slack Chat
url: https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w
about: This issue tracker is not for technical support. Please use our Slack chat, and ask the community for help.
- name: ❓ Simple question - WeChat
url: https://github.com/hpcaitech/ColossalAI/blob/main/docs/images/WeChat.png
about: This issue tracker is not for technical support. Please use WeChat, and ask the community for help.
- name: 😊 Advanced question - GitHub Discussions
url: https://github.com/hpcaitech/ColossalAI/discussions
about: Use GitHub Discussions for advanced and unanswered technical questions that require a maintainer's answer.
name: 📚 Documentation
description: Report an issue related to https://www.colossalai.org/
title: "[DOC]: "
labels: [documentation]
body:
- type: markdown
attributes:
value: >
#### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new).
- type: textarea
attributes:
label: 📚 The doc issue
description: |
**Description** Which content in [Documentation](https://www.colossalai.org/) has an issue?
**Location** Where is the issue located?
**Expectation** What content did you expect instead?
**Screenshots** If applicable, add screenshots to help explain your problem.
**Suggestions** Tell us how we could improve the documentation.
**Optional: Affiliation** Sharing your institution/email helps us better understand our users and improve the project. We welcome in-depth cooperation.
placeholder: |
A clear and concise description of the issue.
validations:
required: true
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[FEATURE]: "
labels: [enhancement]
body:
- type: markdown
attributes:
value: >
#### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new).
- type: textarea
attributes:
label: Describe the feature
description: |
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Suggest a potential alternative/fix**
Tell us how we could improve this project.
**Optional: Affiliation**
Sharing your institution/email helps us better understand our users and improve the project. We welcome in-depth cooperation.
placeholder: |
A clear and concise description of your idea.
validations:
required: true
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
name: 💥 Proposal
description: Propose a non-trivial change to Colossal-AI
title: "[PROPOSAL]: "
labels: [enhancement]
body:
- type: markdown
attributes:
value: |
Common reasons for proposals include:
- Altering the infrastructure;
- Bumping a critical dependency's major version;
- A significant improvement in user-friendliness;
- Significant refactor;
- Optional: sharing your affiliation/email helps us better understand our users and improve the project. We welcome in-depth cooperation.
- ...
Please note this is not the template for feature requests or bug reports; using it for those could lead us to misclassify the issue and close it without action.
We give you maximum freedom to write an elaborate proposal explaining why you think the change is beneficial and what steps we should take to make it a reality.
- type: textarea
attributes:
label: Proposal
description: A clear and concise description of what the proposal is.
validations:
required: true
- type: checkboxes
attributes:
label: Self-service
description: |
If you feel like you could contribute to this issue, please check the box below. This would tell us and other people looking for contributions that someone's working on it.
If you do check this box, please send a pull request within 7 days after a maintainer's approval so we can still delegate this to someone else.
Proposals usually involve significant code changes, so please reach consensus with the maintainers before rushing to implement it, and make sure you follow the [Contributing Guidelines](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
This ensures that you don't waste your time and we don't waste ours reading the large diffs.
options:
- label: I'd be willing to do some initial work on this proposal myself.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
addReviewers: true
addAssignees: author
numberOfReviewers: 1
reviewers:
- frankleeeee
- kurisusnowdeng
name: Assign Reviewers for Team
on:
pull_request:
types: [opened]
jobs:
assign_reviewer:
name: Assign Reviewer for PR
runs-on: ubuntu-latest
if: |
github.event.pull_request.draft == false && github.base_ref == 'main'
&& github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
&& toJson(github.event.pull_request.requested_reviewers) == '[]'
steps:
- uses: kentaro-m/auto-assign-action@v1.2.1
with:
configuration-path: '.github/reviewer_list.yml'
name: Build
on:
pull_request:
types: [synchronize, labeled]
jobs:
build:
name: Build and Test Colossal-AI
if: |
github.event.pull_request.draft == false &&
github.base_ref == 'main' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' &&
contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
steps:
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Install Colossal-AI
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
pip install -r requirements/requirements.txt
pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
name: Build on 8 GPUs
on:
schedule:
# run at 00:00 every day
- cron: '0 0 * * *'
workflow_dispatch:
jobs:
build:
name: Build and Test Colossal-AI
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
container:
image: hpcaitech/pytorch-cuda:1.11.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 40
steps:
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Install Colossal-AI
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
pip install -r requirements/requirements.txt
pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -le "100" ] && PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
name: Close inactive issues
on:
schedule:
- cron: "0 0 * * *"
jobs:
close-issues:
if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v3
with:
days-before-issue-stale: 14
days-before-issue-close: -1
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 14 days with no activity."
# close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: 14
days-before-pr-close: -1
stale-pr-message: "This PR is stale because it has been open for 14 days with no activity."
# close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale."
repo-token: ${{ secrets.GITHUB_TOKEN }}
name: Compatibility Test
on:
workflow_dispatch:
inputs:
torch_version:
type: string
description: torch version, separated by comma
required: true
cuda_version:
type: string
description: cuda version, separated by comma
required: true
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
CUDA_VERSIONS: ${{ inputs.cuda_version }}
run: |
IFS=','
DOCKER_IMAGE=()
for tv in $TORCH_VERSIONS
do
for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"")
done
done
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
container="[${container}]"
echo "$container"
echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
build:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
steps:
- name: Install dependencies
run: |
pip install -U pip setuptools wheel --user
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Install Colossal-AI
run: |
pip install -r requirements/requirements.txt
pip install -v --no-cache-dir .
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
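The `matrix_preparation` step in the workflow above builds the container matrix as a cross product of the comma-separated torch and cuda version inputs. A Python sketch of the same logic (the shell loop is the actual implementation; `build_matrix` is just an illustrative name):

```python
import json
from itertools import product


def build_matrix(torch_versions: str, cuda_versions: str) -> str:
    # Cross-product of the comma-separated version lists into the JSON
    # object later consumed by fromJson() in the build job's strategy.
    images = [
        f"hpcaitech/pytorch-cuda:{tv}-{cv}"
        for tv, cv in product(torch_versions.split(","), cuda_versions.split(","))
    ]
    return json.dumps({"container": images})


matrix = build_matrix("1.11.0,1.12.0", "11.3.0")
print(matrix)
```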
name: Draft GitHub Release Post
on:
workflow_dispatch:
pull_request:
paths:
- 'version.txt'
types:
- closed
jobs:
release:
name: Draft Release Post
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- uses: actions/setup-python@v2
with:
python-version: '3.8.14'
- name: generate draft
id: generate_draft
run: |
version=v$(cat version.txt)
pip install requests
python ./.github/workflows/scripts/generate_release_draft.py --out $PWD/release_draft.md --version $version
echo "::set-output name=version::$version"
echo "::set-output name=path::$PWD/release_draft.md"
env:
GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.generate_draft.outputs.version }}
release_name: Version ${{ steps.generate_draft.outputs.version }} Release Today!
body_path: ${{ steps.generate_draft.outputs.path }}
draft: True
prerelease: false
name: Release bdist wheel
on:
workflow_dispatch:
inputs:
torch_version:
type: string
description: torch version, separated by comma
required: true
default: "all"
cuda_version:
type: string
description: cuda version, separated by comma
required: true
github_ref:
type: string
description: Branch or Tag
default: 'main'
required: true
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
CUDA_VERSIONS: ${{ inputs.cuda_version }}
run: |
echo $TORCH_VERSIONS
echo $CUDA_VERSIONS
IFS=','
DOCKER_IMAGE=()
for cv in $CUDA_VERSIONS
do
DOCKER_IMAGE+=("\"hpcaitech/cuda-conda:${cv}\"")
done
container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
container="[${container}]"
echo "$container"
echo "::set-output name=matrix::{\"container\":$(echo "$container")}"
build:
name: Release bdist wheels
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
runs-on: [self-hosted, gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
# cub is for cuda 10.2
- name: Copy scripts and checkout
run: |
cp -r ./.github/workflows/scripts/* ./
# link the cache directories to current path
ln -s /github/home/conda_pkgs ./conda_pkgs
ln -s /github/home/pip_wheels ./pip_wheels
# set the conda package path
printf "pkgs_dirs:\n - $PWD/conda_pkgs\n" > ~/.condarc
# set safe directory
git config --global --add safe.directory /__w/ColossalAI/ColossalAI
# check out
git checkout $git_ref
# get cub package for cuda 10.2
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
env:
git_ref: ${{ github.event.inputs.github_ref }}
- name: Build bdist wheel
run: |
pip install beautifulsoup4 requests packaging
python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS
env:
TORCH_VERSIONS: ${{ inputs.torch_version }}
- name: 🚀 Deploy
uses: garygrossgarten/github-action-scp@release
with:
local: all_dist
remote: ${{ secrets.PRIVATE_PYPI_DIR }}
host: ${{ secrets.PRIVATE_PYPI_HOST }}
username: ${{ secrets.PRIVATE_PYPI_USER }}
password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
name: Publish Docker Image to DockerHub
on:
workflow_dispatch:
release:
types: [published]
jobs:
release:
name: Publish Docker Image to DockerHub
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: "hpcaitech/docker-in-docker:latest"
options: --gpus all --rm -v /var/run/docker.sock:/var/run/docker.sock
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Build Docker
run: |
version=$(cat version.txt)
docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t hpcaitech/colossalai:$version ./docker
- name: Log in to Docker Hub
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
with:
images: hpcaitech/colossalai
- name: Build and push Docker image
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
name: Release bdist wheel for Nightly versions
on:
schedule:
# run at 00:00 of every Saturday
- cron: '0 0 * * 6'
workflow_dispatch:
jobs:
matrix_preparation:
name: Prepare Container List
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- id: set-matrix
run: |
matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:10.2\"]"
echo $matrix
echo "::set-output name=matrix::{\"container\":$(echo $matrix)}"
build:
name: Release bdist wheels
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor)
runs-on: [self-hosted, gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
# cub is for cuda 10.2
- name: Copy scripts and checkout
run: |
cp -r ./.github/workflows/scripts/* ./
ln -s /github/home/pip_wheels ./pip_wheels
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
- name: Build bdist wheel
run: |
pip install beautifulsoup4 requests packaging
python ./build_colossalai_wheel.py --nightly
- name: 🚀 Deploy
uses: garygrossgarten/github-action-scp@release
with:
local: all_dist
remote: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }}
host: ${{ secrets.PRIVATE_PYPI_HOST }}
username: ${{ secrets.PRIVATE_PYPI_USER }}
password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
remove_old_build:
name: Remove old nightly build
runs-on: ubuntu-latest
needs: build
steps:
- name: executing remote ssh commands using password
uses: appleboy/ssh-action@master
env:
BUILD_DIR: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }}
with:
host: ${{ secrets.PRIVATE_PYPI_HOST }}
username: ${{ secrets.PRIVATE_PYPI_USER }}
password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
envs: BUILD_DIR
script: |
cd $BUILD_DIR
find . -type f -mtime +0 -exec rm -f {} +
script_stop: true
import argparse
import os
import subprocess
from filecmp import cmp
from functools import cmp_to_key
import requests
from bs4 import BeautifulSoup
from packaging import version
WHEEL_TEXT_ROOT_URL = 'https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels'
RAW_TEXT_FILE_PREFIX = 'https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/torch_build/torch_wheels'
CUDA_HOME = os.environ['CUDA_HOME']
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--torch_version', type=str)
parser.add_argument(
'--nightly',
action='store_true',
help=
'whether this build is for nightly release, if True, will only build on the latest PyTorch version and Python 3.8'
)
return parser.parse_args()
def get_cuda_bare_metal_version():
raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return bare_metal_major, bare_metal_minor
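The token walk in `get_cuda_bare_metal_version()` can be exercised without a CUDA toolkit by feeding it a captured `nvcc -V` banner. A standalone sketch of the same parsing (`parse_bare_metal_version` and the sample string are illustrative, not part of the script):

```python
def parse_bare_metal_version(nvcc_output: str):
    # Same logic as get_cuda_bare_metal_version(): find the token "release",
    # take the next token (e.g. "11.3,"), split on "." and keep the major
    # version plus the first digit of the minor version.
    output = nvcc_output.split()
    release = output[output.index("release") + 1].split(".")
    return release[0], release[1][0]


sample = (
    "nvcc: NVIDIA (R) Cuda compiler driver\n"
    "Cuda compilation tools, release 11.3, V11.3.109"
)
print(parse_bare_metal_version(sample))
```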
def all_wheel_info():
page_text = requests.get(WHEEL_TEXT_ROOT_URL).text
soup = BeautifulSoup(page_text, 'html.parser')
all_a_links = soup.find_all('a')
wheel_info = dict()
for a_link in all_a_links:
if 'cuda' in a_link.text and '.txt' in a_link.text:
filename = a_link.text
# drop the '.txt' suffix by length (str.rstrip strips a character set, not a suffix)
torch_version, cuda_version = filename[:-len('.txt')].split('-')
cuda_version = cuda_version.lstrip('cuda')
if torch_version not in wheel_info:
wheel_info[torch_version] = dict()
wheel_info[torch_version][cuda_version] = dict()
file_text = requests.get(f'{RAW_TEXT_FILE_PREFIX}/{filename}').text
lines = file_text.strip().split('\n')
for line in lines:
parts = line.split('\t')
method, url, python_version = parts[:3]
if len(parts) > 3:
flags = parts[3]
flags = ' '.join(flags.split('+'))
else:
flags = ''
wheel_info[torch_version][cuda_version][python_version] = dict(method=method, url=url, flags=flags)
return wheel_info
def build_colossalai(wheel_info):
cuda_version_major, cuda_version_minor = get_cuda_bare_metal_version()
cuda_version_on_host = f'{cuda_version_major}.{cuda_version_minor}'
for torch_version, cuda_versioned_wheel_info in wheel_info.items():
for cuda_version, python_versioned_wheel_info in cuda_versioned_wheel_info.items():
if cuda_version_on_host == cuda_version:
for python_version, wheel_info in python_versioned_wheel_info.items():
url = wheel_info['url']
method = wheel_info['method']
flags = wheel_info['flags']
filename = url.split('/')[-1].replace('%2B', '+')
cmd = f'bash ./build_colossalai_wheel.sh {method} {url} {filename} {cuda_version} {python_version} {torch_version} {flags}'
os.system(cmd)
def main():
args = parse_args()
wheel_info = all_wheel_info()
# filter wheels on condition
all_torch_versions = list(wheel_info.keys())
def _compare_version(a, b):
if version.parse(a) > version.parse(b):
return 1
else:
return -1
all_torch_versions.sort(key=cmp_to_key(_compare_version))
if args.nightly:
# only keep the latest version
for key in all_torch_versions[:-1]:
wheel_info.pop(key)
elif args.torch_version != 'all':
torch_versions = args.torch_version.split(',')
# only keep the torch versions specified
for key in all_torch_versions:
if key not in torch_versions:
wheel_info.pop(key)
build_colossalai(wheel_info)
if __name__ == '__main__':
main()
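The `_compare_version` comparator in `main()` sorts version strings numerically so that, for a nightly build, `all_torch_versions[:-1]` drops everything but the newest PyTorch release. A self-contained sketch of that sort, using an int-tuple `parse` as a simplified stand-in for `packaging.version.parse`:

```python
from functools import cmp_to_key


def parse(v: str):
    # Simplified stand-in for packaging.version.parse: "1.10.0" -> (1, 10, 0),
    # so "1.10.0" correctly sorts after "1.9.1" (unlike string comparison).
    return tuple(int(p) for p in v.split("."))


def _compare_version(a, b):
    return 1 if parse(a) > parse(b) else -1


versions = ["1.10.0", "1.9.1", "1.11.0"]
versions.sort(key=cmp_to_key(_compare_version))
print(versions)  # ascending; the last element is the nightly build target
```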
#!/usr/bin/env bash
method=${1}
url=${2}
filename=${3}
cuda_version=${4}
python_version=${5}
torch_version=${6}
flags=${@:7}
git reset --hard HEAD
mkdir -p ./all_dist
source activate base
conda create -n $python_version -y python=$python_version
source activate $python_version
if [ $1 == "pip" ]
then
wget -nc -q -O ./pip_wheels/$filename $url
pip install ./pip_wheels/$filename
elif [ $1 == 'conda' ]
then
conda install pytorch==$torch_version cudatoolkit=$cuda_version $flags
else
echo "Invalid installation method"
exit 1
fi
if [ $cuda_version == "10.2" ]
then
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
python setup.py bdist_wheel
mv ./dist/* ./all_dist
# must remove build to enable compilation for
# cuda extension in the next build
rm -rf ./build
python setup.py clean
conda deactivate
conda env remove -n $python_version
#!/usr/bin/env python
# coding: utf-8
import argparse
import os
import re
import requests
COMMIT_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/commits'
TAGS_API = 'https://api.github.com/repos/hpcaitech/ColossalAI/tags'
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='output path for the release draft', required=True)
parser.add_argument('--version', type=str, help='current version to release', required=True)
return parser.parse_args()
def get_latest_tag_commit(headers=None):
res = requests.get(url=TAGS_API, headers=headers)
data = res.json()
commit_hash = data[0]['commit']['sha']
version = data[0]['name']
return commit_hash, version
def get_commit_info(commit_hash, headers=None):
api = f'{COMMIT_API}/{commit_hash}'
res = requests.get(url=api, headers=headers)
return res.json()
def get_all_commit_info(since, headers=None):
page = 1
results = []
while True:
api = f'{COMMIT_API}?since={since}&per_page=100&page={page}'
resp = requests.get(url=api, headers=headers)
data = resp.json()
# exit when no more data
if len(data) == 0:
break
results.extend(data)
page += 1
return results
def collate_release_info(commit_info_list):
results = dict()
pattern = r'\[.*\]'
for commit_info in commit_info_list:
author = commit_info['commit']['author']['name']
author_url = commit_info['author']['url']
msg = commit_info['commit']['message']
match = re.search(pattern, msg)
if match:
tag = match.group().lstrip('[').rstrip(']').capitalize()
if tag not in results:
results[tag] = []
results[tag].append((msg, author, author_url))
return results
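The tag extraction in `collate_release_info()` relies on commit subjects following a `[tag] message` convention. A small demo of the regex and normalization it applies (the commit messages below are hypothetical examples):

```python
import re

# Hypothetical commit subjects illustrating the '[tag]' convention.
msgs = [
    "[feature] add zero optimizer (#123)",
    "[doc] fix typo in README",
    "plain message without a tag",
]
pattern = r'\[.*\]'
tags = []
for msg in msgs:
    match = re.search(pattern, msg)
    if match:
        # strip the surrounding brackets and normalize capitalization
        tags.append(match.group().lstrip('[').rstrip(']').capitalize())
print(tags)
```

Untagged commits fall through the `if match` check and are simply omitted from the release notes.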
def generate_release_post_markdown(current_version, last_version, release_info):
text = []
# add highlights
highlights = "## What's Changed \n\n"
text.append(highlights)
# add items
for k, v in release_info.items():
topic = f"### {k} \n"
text.append(topic)
for msg, author, author_url in v:
# only keep the first line
msg = msg.split('\n')[0]
item = f'{msg} by [{author}]({author_url})\n'
text.append(f'- {item}')
text.append('\n')
# add full change log
text.append(
f'**Full Changelog**: https://github.com/hpcaitech/ColossalAI/compare/{last_version}...{current_version}')
return text
if __name__ == '__main__':
args = parse_args()
token = os.environ['GITHUB_API_TOKEN']
headers = {'Authorization': f'token {token}'}
# get previous release tag
last_release_commit, last_version = get_latest_tag_commit(headers)
last_release_commit_info = get_commit_info(last_release_commit, headers=headers)
last_release_date = last_release_commit_info['commit']['author']['date']
# get the commits since last release
commit_info = get_all_commit_info(since=last_release_date, headers=headers)
commit_info = commit_info[:-1] # remove the release commit
# collate into markdown
release_info = collate_release_info(commit_info)
markdown_text = generate_release_post_markdown(args.version, last_version, release_info)
# write into a file
with open(args.out, 'w') as f:
for line in markdown_text:
f.write(line)