Commit 21d47d0e authored by yuguo's avatar yuguo
Browse files

Oneflow 0.8 for DCU

parents
sphinx==3.5.4
jinja2<3.1
recommonmark==0.6.0
furo==2021.4.11b34
sphinx-copybutton==0.5.0
# dependencies above must be identical to docs/requirements.txt
pycocotools
opencv-python==4.2.0.34
scipy
pillow
tensorflow-addons==0.9.1
https://oneflow-static.oss-cn-beijing.aliyuncs.com/pipindex/pipindex-0.1.3-py2.py3-none-any.whl
# Stage that prebuilds OneFlow's third-party dependencies only.
# `from` is the builder base image (a manylinux2014 image built elsewhere).
ARG from
FROM ${from}
WORKDIR /workspace/build
COPY cmake /workspace/cmake
COPY CMakeLists.txt /workspace/CMakeLists.txt
# BUILD DEPENDENCY
COPY build/third_party /workspace/build/third_party
# Put Intel MKL on the loader path, then configure with ONEFLOW=OFF so that
# only the third-party targets are generated, and build them.
RUN export LD_LIBRARY_PATH=/opt/intel/lib/intel64_lin:/opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH; \
cmake -DTHIRD_PARTY=ON -DONEFLOW=OFF -DCMAKE_BUILD_TYPE=Release -DRELEASE_VERSION=ON .. && make -j prepare_oneflow_third_party
# CentOS-Base.repo
#
# From https://mirror.tuna.tsinghua.edu.cn/help/centos/
#
# The mirror system uses the connecting IP address of the client and the
# update status of each mirror to pick mirrors that are updated to and
# geographically close to the client. You should use this for CentOS updates
# unless you are manually picking other mirrors.
#
# If the mirrorlist= does not work for you, as a fall back you can try the
# remarked out baseurl= line instead.
#
#
[base]
name=CentOS-$releasever - Base
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/$releasever/os/$basearch/
http://mirrors.aliyun.com/centos/$releasever/os/$basearch/
http://mirrors.aliyuncs.com/centos/$releasever/os/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=os
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7
#released updates
[updates]
name=CentOS-$releasever - Updates
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/$releasever/updates/$basearch/
http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/
http://mirrors.aliyuncs.com/centos/$releasever/updates/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/$releasever/extras/$basearch/
http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/
http://mirrors.aliyuncs.com/centos/$releasever/extras/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7
#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/$releasever/centosplus/$basearch/
http://mirrors.aliyun.com/centos/$releasever/centosplus/$basearch/
http://mirrors.aliyuncs.com/centos/$releasever/centosplus/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=centosplus
gpgcheck=1
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7
# CentOS-Base.repo
#
# The mirror system uses the connecting IP address of the client and the
# update status of each mirror to pick mirrors that are updated to and
# geographically close to the client. You should use this for CentOS updates
# unless you are manually picking other mirrors.
#
# If the mirrorlist= does not work for you, as a fall back you can try the
# remarked out baseurl= line instead.
#
#
[base]
name=CentOS-$releasever - Base - 163.com
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=os
baseurl=http://mirrors.163.com/centos/$releasever/os/$basearch/
gpgcheck=1
gpgkey=http://mirrors.163.com/centos/RPM-GPG-KEY-CentOS-7
#released updates
[updates]
name=CentOS-$releasever - Updates - 163.com
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates
baseurl=http://mirrors.163.com/centos/$releasever/updates/$basearch/
gpgcheck=1
gpgkey=http://mirrors.163.com/centos/RPM-GPG-KEY-CentOS-7
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras - 163.com
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras
baseurl=http://mirrors.163.com/centos/$releasever/extras/$basearch/
gpgcheck=1
gpgkey=http://mirrors.163.com/centos/RPM-GPG-KEY-CentOS-7
#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus - 163.com
baseurl=http://mirrors.163.com/centos/$releasever/centosplus/$basearch/
gpgcheck=1
enabled=0
gpgkey=http://mirrors.163.com/centos/RPM-GPG-KEY-CentOS-7
# Builder image: manylinux2014 (CentOS 7) toolchain plus OneFlow build deps.
# `from` is expected to be an nvidia/cuda *-devel-centos7 image.
ARG from
FROM ${from}
ARG use_tuna_yum=0
ARG pip_args=""
ARG bazel_url="https://github.com/bazelbuild/bazel/releases/download/3.4.1/bazel-3.4.1-linux-x86_64"
LABEL maintainer="OneFlow Maintainers"
# manylinux2014
ENV AUDITWHEEL_ARCH x86_64
ENV AUDITWHEEL_PLAT manylinux2014_$AUDITWHEEL_ARCH
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PATH $PATH:/usr/local/bin
ENV LD_LIBRARY_PATH /usr/local/lib64:/usr/local/lib
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
# use tuna mirror
COPY docker/package/manylinux/CentOS7-Base-163.repo /tmp/CentOS-Base.repo
RUN if [ "${use_tuna_yum}" = "1" ]; then mv /tmp/CentOS-Base.repo /etc/yum.repos.d/ && yum makecache ; fi
# to speed up docker img building disable cuda repo
# in 10.1, cuda yum repo will update cublas to 10.2 and breaks build
RUN yum-config-manager --disable cuda nvidia-ml
ARG MANYLINUX_SHA=b634044
# Bootstrap the manylinux toolchain from the pinned Oneflow-Inc/manylinux
# build scripts; the zip and scripts are removed in the same layer.
RUN yum -y install unzip && curl -L -o manylinux.zip https://github.com/Oneflow-Inc/manylinux/archive/${MANYLINUX_SHA}.zip && unzip manylinux.zip -d tmp && cp -r tmp/*/docker/build_scripts /build_scripts && bash build_scripts/build.sh && rm -r build_scripts tmp manylinux.zip
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# manylinux2014 end
# Intel oneAPI MKL, SCL devtoolset-7 and assorted build/debug tools.
RUN yum-config-manager --add-repo https://yum.repos.intel.com/oneapi && \
rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \
yum update -y && yum install -y epel-release && \
yum -y install centos-release-scl && \
yum install -y intel-oneapi-mkl-devel-2021.2.0 nasm rdma-core-devel devtoolset-7-gcc* rsync gdb
# Pinned cmake installed via pip and symlinked onto PATH.
RUN /opt/python/cp35-cp35m/bin/pip install $pip_args -U cmake==3.18.4.post1 && ln -s /opt/_internal/cpython-3.5.9/bin/cmake /usr/bin/cmake
# Build the pinned Oneflow-Inc patchelf fork from source (used by auditwheel).
RUN mkdir -p /tmp && cd /tmp && \
curl -L -o patchelf-src.zip \
https://github.com/Oneflow-Inc/patchelf/archive/64bf5388ef7d45d3697c4aadbd3f5d7d68a22aa3.zip && \
unzip patchelf-src.zip && cd patchelf-* && ./bootstrap.sh && ./configure && make -j`nproc` && \
make install && cd .. && rm -rf patchelf-*
# Install bazel; the trailing bare `bazel` run extracts its install base
# at image-build time so the first real invocation is fast.
RUN curl -L $bazel_url -o /usr/local/bin/bazel \
&& chmod +x /usr/local/bin/bazel \
&& bazel
COPY dev-requirements.txt /tmp/dev-requirements.txt
# Pre-install the python dev requirements for each supported CPython.
RUN /opt/python/cp36-cp36m/bin/pip install $pip_args -r /tmp/dev-requirements.txt --user \
&& /opt/python/cp37-cp37m/bin/pip install $pip_args -r /tmp/dev-requirements.txt --user \
&& /opt/python/cp38-cp38/bin/pip install $pip_args -r /tmp/dev-requirements.txt --user \
&& rm /tmp/dev-requirements.txt
# 使用 docker 生成 OneFlow wheel 包
### 创建 docker 容器
在 OneFlow 源码根目录下运行:
```
docker build -f docker/package/manylinux/Dockerfile --build-arg from=nvidia/cuda:10.2-cudnn7-devel-centos7 -t oneflow:manylinux2014-cuda10.2 .
```
### 打包 manylinux python wheel
这里有 manylinux2014(centos7) + cuda10.2 的 Dockerfile,里面安装了编译 oneflow 所需的库,假设你已经用 Dockerfile build 了一个 docker 镜像,叫做 oneflow:manylinux2014-cuda10.2,那么只要在 oneflow 源码目录运行
```bash
docker run --rm -it -v `pwd`:/oneflow-src -w /oneflow-src oneflow:manylinux2014-cuda10.2
```
If you prefer to operate inside the docker container:
```bash
docker run --rm -it -v `pwd`:/oneflow-src -w /oneflow-src oneflow:manylinux2014-cuda10.2 bash
```
```bash
/oneflow-src/docker/package/manylinux/build_wheel.sh --python3.6 --wheel-dir /oneflow-src/wheel-test
```
就会在 docker 镜像里执行 build_wheel.sh 来编译生成指定 python 版本(不传版本参数时为 python 3.5 到 python 3.8)的 oneflow manylinux2014 wheel。生成的包在 `--wheel-dir` 指定的目录内(不指定时默认为 oneflow 源码目录下的 wheelhouse/ 文件夹)
#### 注意事项
1. 运行 `docker run` 时可能需要添加 `-e http_proxy=$http_proxy -e https_proxy=$https_proxy` 参数,以在容器内使用宿主机的代理,避免编译第三方库时因为网络问题而出错
2. 只要运行了 `cmake -DTHIRD_PARTY=ON ..`,oneflow 本体都会从头编译,所以如果第三方库已经由 docker 容器编译过,这次只想增量编译 oneflow 本体,可以用命令
```bash
docker run --rm -it -v `pwd`:/oneflow-src oneflow:manylinux2014-cuda10.2 /oneflow-src/docker/package/manylinux/build_wheel.sh --skip-third-party
```
这会给 build_wheel.sh 传一个 `--skip-third-party` 参数,跳过第三方库的编译
3. 只想生成某些 python 版本的包,例如 python3.5,可以用命令
```bash
docker run --rm -it -v `pwd`:/oneflow-src oneflow:manylinux2014-cuda10.2 /oneflow-src/docker/package/manylinux/build_wheel.sh --python3.5
```
支持的参数是 `--python3.5`、`--python3.6`、`--python3.7`、`--python3.8`,需要生成多个版本可以同时传入多个参数。不传入版本参数则会生成所有的 python 版本的包
4. 如果想自定义 oneflow 编译时的 cmake 参数,可以直接把 cmake 参数写出来,如:
```bash
docker run --rm -it -v `pwd`:/oneflow-src oneflow:manylinux2014-cuda10.2 /oneflow-src/docker/package/manylinux/build_wheel.sh -DWITH_XLA=ON
```
import os
import subprocess
import tempfile
from pathlib import Path
import getpass
import uuid
def get_arg_env(env_var_name: str, mode="run"):
    """Render a docker CLI flag that forwards a host environment variable.

    Args:
        env_var_name: name of an environment variable; it must be set and
            non-empty (asserted).
        mode: "run" emits an `--env NAME=VAL` flag for `docker run`;
            "build" emits a `--build-arg NAME=VAL` flag for `docker build`.

    Returns:
        The formatted flag string.

    Raises:
        ValueError: if `mode` is neither "run" nor "build".
    """
    val = os.getenv(env_var_name)
    assert val, f"system environment variable {env_var_name} found empty"
    if mode == "run":
        return f"--env {env_var_name}={val}"
    elif mode == "build":
        return f"--build-arg {env_var_name}={val}"
    else:
        # BUG FIX: the original did `raise f"..."`, which raises
        # TypeError("exceptions must derive from BaseException") instead of
        # a meaningful error. Raise a proper exception type.
        raise ValueError(f"{mode} not supported")
def get_proxy_build_args():
    """Collect `--build-arg` flags forwarding the host's proxy settings so
    that `docker build` reaches the network through the same proxy.

    Upper-case variables are handled first, then lower-case; each pair is
    only forwarded when its HTTP_* probe variable is set.
    """
    flags = []
    for probe, pair in (
        ("HTTP_PROXY", ("HTTP_PROXY", "HTTPS_PROXY")),
        ("http_proxy", ("http_proxy", "https_proxy")),
    ):
        if os.getenv(probe):
            flags.extend(get_arg_env(name, mode="build") for name in pair)
    return " ".join(flags)
def get_proxy_env_args():
    """Collect `--env` flags forwarding the host's proxy settings into
    `docker run` containers.

    Mirrors `get_proxy_build_args` but emits run-mode flags.
    """
    flags = []
    for probe, pair in (
        ("HTTP_PROXY", ("HTTP_PROXY", "HTTPS_PROXY")),
        ("http_proxy", ("http_proxy", "https_proxy")),
    ):
        if os.getenv(probe):
            flags.extend(get_arg_env(name) for name in pair)
    return " ".join(flags)
def build_img(
    cuda_version,
    oneflow_src_dir,
    use_aliyun_mirror,
    use_tuna,
    use_system_proxy,
    img_tag,
    dry,
):
    """Build the manylinux builder image `img_tag` from an nvidia/cuda base.

    Prints the assembled `docker build` command and, unless `dry` is true,
    executes it with `oneflow_src_dir` as the build context.
    """
    # CUDA 11.x bases ship cuDNN 8; older ones use cuDNN 7.
    cudnn_version = 8 if str(cuda_version).startswith("11") else 7
    # Map short CUDA versions to the exact patch tags published on docker hub.
    patch_tags = {"11.2": "11.2.2", "11.1": "11.1.1", "11.0": "11.0.3"}
    cuda_version_img = patch_tags.get(cuda_version, cuda_version)
    from_img = f"nvidia/cuda:{cuda_version_img}-cudnn{cudnn_version}-devel-centos7"
    tuna_build_arg = ""
    if use_tuna:
        tuna_build_arg = '--build-arg use_tuna_yum=1 --build-arg pip_args="-i https://mirrors.aliyun.com/pypi/simple"'
    if use_aliyun_mirror:
        # Fetch bazel from the aliyun OSS mirror instead of github.
        tuna_build_arg += ' --build-arg bazel_url="https://oneflow-static.oss-cn-beijing.aliyuncs.com/deps/bazel-3.4.1-linux-x86_64"'
    proxy_build_arg = get_proxy_build_args() if use_system_proxy else ""
    cmd = f"docker build -f docker/package/manylinux/Dockerfile {proxy_build_arg} {tuna_build_arg} --build-arg from={from_img} -t {img_tag} ."
    print(cmd)
    if not dry:
        subprocess.check_call(cmd, cwd=oneflow_src_dir, shell=True)
def common_cmake_args(cache_dir=None, extra_oneflow_cmake_args=None):
    """Compose the cmake flags shared by the third-party and oneflow builds.

    Adds a Release build type and RDMA support unless the caller already set
    them via `extra_oneflow_cmake_args`, and always points THIRD_PARTY_DIR
    into the per-configuration cache directory.
    """
    assert cache_dir
    extra = extra_oneflow_cmake_args or ""
    parts = []
    if "-DCMAKE_BUILD_TYPE" not in extra:
        parts.append(" -DCMAKE_BUILD_TYPE=Release")
    if "-DBUILD_RDMA" not in extra:
        parts.append(" -DBUILD_RDMA=ON")
    install_dir = os.path.join(cache_dir, "build-third-party-install")
    parts.append(f" -DTHIRD_PARTY_DIR={install_dir}")
    return "".join(parts)
def get_build_dir_arg(cache_dir, oneflow_src_dir):
    """Return extra `docker run` mount flags for the build directory.

    Mounting the cached build dir over the in-tree `build/` is currently
    disabled, so this always returns an empty string.
    """
    # NOTE: the original kept the mount logic as unreachable statements after
    # an unconditional `return ""`; that dead code is dropped here. To
    # re-enable, return
    #   f"-v {os.path.join(cache_dir, 'build')}:{os.path.join(oneflow_src_dir, 'build')}"
    return ""
def force_rm_dir(dir_to_clean):
    """Recursively delete the contents of `dir_to_clean`.

    Runs `rm -rf` inside a throwaway busybox container with the directory
    bind-mounted, so files created as root by earlier containerized builds
    can be removed even when this script runs unprivileged.
    """
    print("cleaning:", dir_to_clean)
    # Guard: an empty path would make the glob below expand dangerously.
    assert dir_to_clean
    clean_cmd = f"docker run --network=host --rm -v {dir_to_clean}:{dir_to_clean} -w {dir_to_clean} busybox rm -rf {dir_to_clean}/*"
    subprocess.check_call(clean_cmd, shell=True)
def create_tmp_bash_and_run(docker_cmd, img, bash_cmd, bash_args, bash_wrap, dry):
    """Write `bash_cmd` to a temp script, wrap it with `bash_wrap` (e.g. an
    scl toolchain activation), and execute the wrapper inside container
    image `img` using the prepared `docker_cmd` prefix.

    Both scripts are created on the host under /tmp and reached from inside
    the container via the `-v /tmp:/host/tmp` mount added below. Prints the
    scripts and the final command; skips execution when `dry` is true.
    """
    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as wrapper_f:
        with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
            # Container-side paths of the host temp files.
            w_name = "/host" + wrapper_f.name
            f_name = "/host" + f.name
            # Make python3.7 the default python inside the container.
            bash_cmd = "PATH=/opt/python/cp37-cp37m/bin:$PATH\n" + bash_cmd
            f.write(bash_cmd)
            f.flush()
            wrapped = f"""
{bash_wrap}
bash {bash_args} {f_name}
"""
            wrapper_f.write(wrapped)
            wrapper_f.flush()
            print("=" * 5 + f"bash_cmd: {f_name}" + "=" * 5)
            print(bash_cmd)
            print("=" * 5 + f"bash_cmd: {f_name}" + "=" * 5)
            print("=" * 5 + f"wrapped: {w_name}" + "=" * 5)
            print(wrapped)
            print("=" * 5 + f"wrapped: {w_name}" + "=" * 5)
            # Expose host /tmp (where the scripts live) inside the container.
            docker_cmd = f"{docker_cmd} -v /tmp:/host/tmp {img}"
            cmd = f"{docker_cmd} bash {bash_args} {w_name}"
            print(cmd)
            if dry:
                print("dry run, skipping")
            else:
                subprocess.check_call(cmd, shell=True)
def get_common_docker_args(
    oneflow_src_dir=None,
    cache_dir=None,
    current_dir=None,
    house_dir=None,
    use_system_proxy=True,
    inplace=False,
):
    """Assemble the `docker run` arguments shared by every build step:
    cache/ccache mounts, source mount (read-only unless `inplace`), proxy
    env forwarding, working directory and shm size.
    """
    # The working directory must live inside the cache dir so all build
    # output lands in mounted (persistent) storage.
    assert Path(cache_dir) in Path(current_dir).parents
    cwd = os.getcwd()
    per_user_caches = [
        f"-v {os.path.join(cache_dir, 'ccache')}:/root/.ccache",
        f"-v {os.path.join(cache_dir, 'local')}:/root/.local",
        f"-v {os.path.join(cache_dir, 'cache')}:/root/.cache",
    ]
    cache_dir_args = " ".join(per_user_caches)
    pwd_arg = f"-v {cwd}:{cwd}"
    cache_dir_arg = f"-v {cache_dir}:{cache_dir}"
    house_dir_arg = f"-v {house_dir}:{house_dir}" if house_dir else ""
    build_dir_arg = get_build_dir_arg(cache_dir, oneflow_src_dir)
    proxy_env_arg = get_proxy_env_args() if use_system_proxy else ""
    # Mount the source tree read-only unless building in place.
    inplace_attr = ":ro" if inplace == False else ""
    return f"{cache_dir_args} -v {oneflow_src_dir}:{oneflow_src_dir}{inplace_attr} {proxy_env_arg} {pwd_arg} {house_dir_arg} {cache_dir_arg} {build_dir_arg} -w {current_dir} --shm-size=8g"
def get_python_dir(inplace=True, oneflow_src_dir=None, cache_dir=None):
    """Path of the python package directory: the in-tree `python/` when
    building inplace, otherwise a working copy under the cache dir.
    """
    base = oneflow_src_dir if inplace else cache_dir
    # The relevant root must be provided for the chosen mode.
    assert base
    return os.path.join(base, "python")
def build_third_party(
    img_tag,
    oneflow_src_dir,
    cache_dir,
    extra_oneflow_cmake_args,
    extra_docker_args,
    bash_args,
    bash_wrap,
    dry,
    use_system_proxy,
    inplace,
):
    """Configure and build OneFlow's third-party dependencies inside the
    builder container `img_tag`.

    Generates a bash script (python/ staging + cmake configure + build of
    the `oneflow_deps` target) and runs it via `create_tmp_bash_and_run`,
    working in the `build-third-party` subdir of `cache_dir`.
    """
    third_party_build_dir = os.path.join(cache_dir, "build-third-party")
    oneflow_python_dir = get_python_dir(
        inplace=inplace, oneflow_src_dir=oneflow_src_dir, cache_dir=cache_dir
    )
    if inplace:
        inplace_arg = ""
        oneflow_python_dir_cmd = ""
    else:
        # Out-of-tree build: work on a copy of python/ under the cache dir;
        # `git clean -X` strips ignored (generated) files from the copy.
        inplace_arg = f"-DONEFLOW_PYTHON_DIR={oneflow_python_dir}"
        oneflow_python_dir_cmd = f"""
rm -rf {oneflow_python_dir}
cp -r {oneflow_src_dir}/python {oneflow_python_dir}
cd {oneflow_python_dir}
git init
git clean -nXd
git clean -fXd
cd -
"""
    cmake_cmd = " ".join(
        [
            "cmake",
            common_cmake_args(
                cache_dir=cache_dir, extra_oneflow_cmake_args=extra_oneflow_cmake_args
            ),
            # Third-party only; oneflow itself is built later by build_oneflow.
            "-DTHIRD_PARTY=ON -DONEFLOW=OFF",
            extra_oneflow_cmake_args,
            oneflow_src_dir,
            inplace_arg,
        ]
    )
    bash_cmd = f"""set -ex
export ONEFLOW_PYTHON_DIR={oneflow_python_dir}
{oneflow_python_dir_cmd}
export PATH="$PATH:$(dirname {get_python_bin('3.6')})"
export PYTHON_BIN_PATH={get_python_bin('3.6')}
$PYTHON_BIN_PATH -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r {os.path.join(oneflow_src_dir, "ci/fixed-dev-requirements.txt")}
$PYTHON_BIN_PATH -c "from __future__ import print_function;import numpy; print(numpy.get_include());"
{cmake_cmd}
cmake --build . -j `nproc` --target oneflow_deps
"""
    common_docker_args = get_common_docker_args(
        oneflow_src_dir=oneflow_src_dir,
        cache_dir=cache_dir,
        current_dir=third_party_build_dir,
        use_system_proxy=use_system_proxy,
        inplace=inplace,
    )
    docker_cmd = (
        f"docker run --network=host {extra_docker_args} --rm {common_docker_args}"
    )
    create_tmp_bash_and_run(docker_cmd, img_tag, bash_cmd, bash_args, bash_wrap, dry)
def get_python_bin(version):
    """Map a python version string ("3.5".."3.9") to the interpreter path
    inside a manylinux2014 image, e.g. "3.6" -> /opt/python/cp36-cp36m/bin/python.

    CPython builds up to 3.7 carry the trailing "m" (pymalloc) ABI flag.
    """
    assert version in ["3.5", "3.6", "3.7", "3.8", "3.9"]
    digits = version.replace(".", "")
    abi_flag = "m" if version in ("3.5", "3.6", "3.7") else ""
    py_abi = f"cp{digits}-cp{digits}{abi_flag}"
    return f"/opt/python/{py_abi}/bin/python"
def build_oneflow(
    img_tag,
    oneflow_src_dir,
    cache_dir,
    extra_oneflow_cmake_args,
    extra_docker_args,
    python_version,
    skip_wheel,
    package_name,
    house_dir,
    bash_args,
    bash_wrap,
    dry,
    use_system_proxy,
    enter_bash,
    skip_audit,
    inplace,
):
    """Compile oneflow for one python version inside the builder container
    and (unless skipped) package the wheel into `house_dir`, auditing it
    with auditwheel unless `skip_audit`.

    With `enter_bash`, drops into an interactive shell after the setup
    steps instead of building.
    """
    oneflow_build_dir = os.path.join(cache_dir, "build-oneflow")
    python_bin = get_python_bin(python_version)
    oneflow_python_dir = get_python_dir(
        inplace=inplace, oneflow_src_dir=oneflow_src_dir, cache_dir=cache_dir
    )
    if inplace:
        inplace_arg = ""
    else:
        inplace_arg = f"-DONEFLOW_PYTHON_DIR={oneflow_python_dir}"
    cmake_cmd = " ".join(
        [
            "cmake",
            common_cmake_args(
                cache_dir=cache_dir, extra_oneflow_cmake_args=extra_oneflow_cmake_args
            ),
            # Third-party deps were prebuilt by build_third_party.
            "-DTHIRD_PARTY=OFF -DONEFLOW=ON",
            extra_oneflow_cmake_args,
            "-DCMAKE_EXPORT_COMPILE_COMMANDS=1",
            f"-DPython3_EXECUTABLE={python_bin}",
            f"-DCODEGEN_PYTHON_EXECUTABLE={get_python_bin('3.6')}",
            oneflow_src_dir,
            inplace_arg,
        ]
    )
    common_docker_args = get_common_docker_args(
        oneflow_src_dir=oneflow_src_dir,
        cache_dir=cache_dir,
        current_dir=oneflow_build_dir,
        house_dir=house_dir,
        use_system_proxy=use_system_proxy,
        inplace=inplace,
    )
    docker_cmd = (
        f"docker run --network=host --rm {common_docker_args} {extra_docker_args}"
    )
    if enter_bash:
        docker_cmd += " -it"
    # Base script: put MKL on the loader path, export the cmake command for
    # reference, and install the pinned dev requirements.
    bash_cmd = f"""set -ex
export LD_LIBRARY_PATH=/opt/intel/lib/intel64_lin:/opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/intel/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH
export ONEFLOW_SRC_DIR={oneflow_src_dir}
export ONEFLOW_CMAKE_CMD="{cmake_cmd}"
{python_bin} -m pip install -i https://mirrors.aliyun.com/pypi/simple --user -r {os.path.join(oneflow_src_dir, "ci/fixed-dev-requirements.txt")}
"""
    if enter_bash:
        bash_cmd += "\nbash"
    else:
        # Clean generated files (but keep oneflow/include) then configure
        # and build.
        bash_cmd += f"""
cd {oneflow_python_dir}
git clean -nXd -e \!oneflow/include -e \!oneflow/include/**
git clean -fXd -e \!oneflow/include -e \!oneflow/include/**
cd -
{cmake_cmd}
cmake --build . -j `nproc`
"""
    if skip_wheel or enter_bash:
        pass
    else:
        bash_cmd += f"""
cd {oneflow_python_dir}
{python_bin} setup.py bdist_wheel -d /tmp/tmp_wheel --package_name {package_name}
cd -
"""
    if skip_wheel == False:
        if skip_audit:
            # Ship the raw wheel without the manylinux audit/repair pass.
            bash_cmd += f"""
cp /tmp/tmp_wheel/*.whl {house_dir}
"""
        else:
            bash_cmd += f"""
auditwheel repair /tmp/tmp_wheel/*.whl --wheel-dir {house_dir}
"""
    return create_tmp_bash_and_run(
        docker_cmd, img_tag, bash_cmd, bash_args, bash_wrap, dry
    )
def is_img_existing(tag):
    """Return True if the docker image `tag` exists locally.

    Probes with `docker image inspect`, discarding its output; a zero exit
    status means the image is present. Any failure (missing image, missing
    docker, unreachable daemon) reads as "not existing".
    """
    result = subprocess.run(
        f"docker image inspect {tag}",
        shell=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    # Simplified from `if returncode == 0: return True else: return False`.
    return result.returncode == 0
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--custom_img_tag", type=str, required=False, default=None,
    )
    parser.add_argument(
        "--container_name", type=str, required=False, default=None,
    )
    parser.add_argument(
        "--cache_dir", type=str, required=False, default=None,
    )
    default_wheel_house_dir = os.path.join(os.getcwd(), "wheelhouse")
    parser.add_argument(
        "--wheel_house_dir", type=str, required=False, default=default_wheel_house_dir,
    )
    parser.add_argument("--python_version", type=str, required=True)
    parser.add_argument(
        "--cuda_version", type=str, required=False, default="10.2",
    )
    parser.add_argument(
        "--package_name", type=str, required=False, default="oneflow",
    )
    parser.add_argument(
        "--extra_oneflow_cmake_args", action="append", nargs="+", default=[]
    )
    parser.add_argument(
        "--extra_docker_args", type=str, required=False, default="",
    )
    parser.add_argument(
        "--oneflow_src_dir", type=str, required=False, default=os.getcwd(),
    )
    parser.add_argument(
        "--skip_third_party", default=False, action="store_true", required=False
    )
    parser.add_argument(
        "--skip_wheel", default=False, action="store_true", required=False
    )
    parser.add_argument(
        "--skip_img", default=False, action="store_true", required=False
    )
    parser.add_argument(
        "--skip_audit", default=False, action="store_true", required=False
    )
    parser.add_argument(
        "--build_img", default=False, action="store_true", required=False
    )
    parser.add_argument(
        "--use_tuna", default=False, action="store_true", required=False
    )
    parser.add_argument("--dry", default=False, action="store_true", required=False)
    parser.add_argument(
        "--use_system_proxy", default=False, action="store_true", required=False
    )
    parser.add_argument("--mlir", default=False, action="store_true", required=False)
    parser.add_argument("--gcc4", default=False, action="store_true", required=False)
    parser.add_argument("--gcc7", default=False, action="store_true", required=False)
    parser.add_argument("--gcc9", default=False, action="store_true", required=False)
    parser.add_argument(
        "--use_aliyun_mirror", default=False, action="store_true", required=False
    )
    parser.add_argument("--cpu", default=False, action="store_true", required=False)
    parser.add_argument("--bash", default=False, action="store_true", required=False)
    parser.add_argument("--inplace", default=False, action="store_true", required=False)
    parser.add_argument(
        "--shared_lib", default=False, action="store_true", required=False
    )
    parser.add_argument("--retry", default=0, type=int)
    args = parser.parse_args()
    if args.skip_img:
        # NOTE(review): this bare string is a no-op statement, not a warning;
        # it was presumably meant to be printed.
        "Arg skip_img is deprecated. Setting it has no effect. If you want to build image, use --build_img"
    if args.skip_wheel:
        args.skip_audit = True
    print("args.extra_oneflow_cmake_args", args.extra_oneflow_cmake_args)
    assert args.package_name
    # --extra_oneflow_cmake_args may be given multiple times with multiple
    # values each; flatten the list-of-lists into one space-separated string.
    extra_oneflow_cmake_args = " ".join(
        [" ".join(l) for l in args.extra_oneflow_cmake_args]
    )
    # Default to gcc7 when no toolchain flag was given.
    if (not args.gcc4) and (not args.gcc7) and (not args.gcc9):
        args.gcc7 = True
    cuda_versions = []
    if args.use_aliyun_mirror:
        extra_oneflow_cmake_args += " -DTHIRD_PARTY_MIRROR=aliyun"
    if args.shared_lib:
        extra_oneflow_cmake_args += " -DBUILD_SHARED_LIBS=ON"
    if args.cpu:
        # CPU-only builds still use the (single) cuda10.2 builder image.
        extra_oneflow_cmake_args += " -DBUILD_CUDA=OFF"
        cuda_versions = ["10.2"]
    else:
        extra_oneflow_cmake_args += " -DBUILD_CUDA=ON"
        cuda_versions = args.cuda_version.split(",")
    cuda_versions = [v.strip() for v in cuda_versions]
    if args.mlir:
        extra_oneflow_cmake_args += " -DWITH_MLIR=ON"
    else:
        extra_oneflow_cmake_args += " -DWITH_MLIR=Off"
    for cuda_version in cuda_versions:
        cache_dir = None
        def build():
            """Run one full build for the current `cuda_version`: pick or
            build the builder image, compile third-party deps, then build
            oneflow wheels for every requested python version. Reads the
            parsed CLI `args` and loop variables from the enclosing scope.
            """
            img_tag = None
            img_prefix = f"oneflow-manylinux2014-cuda{cuda_version}"
            user = getpass.getuser()
            # Known-good versioned builder image tags per CUDA line.
            versioned_img_tag = f"{img_prefix}:0.1"
            if cuda_version in ["11.0", "11.1"]:
                versioned_img_tag = f"{img_prefix}:0.2"
            enforced_oneflow_cmake_args = ""
            enforced_oneflow_cmake_args += " -DBUILD_TESTING=ON"
            # Static cuDNN linking is not supported with CUDA >= 11.
            if float(cuda_version) >= 11:
                assert (
                    "CUDNN_STATIC" not in extra_oneflow_cmake_args
                ), "CUDNN_STATIC will be set to OFF if cuda_version > 11"
                enforced_oneflow_cmake_args += " -DCUDNN_STATIC=OFF"
            extra_docker_args = args.extra_docker_args
            if not args.container_name:
                args.container_name = f"manylinux-build-run-by-{getpass.getuser()}"
            assert args.container_name
            # Remove any leftover container with the same name from a
            # previous (possibly aborted) run.
            subprocess.call(
                f"docker rm -f {args.container_name}", shell=True,
            )
            extra_docker_args += f" --name {args.container_name}"
            user_img_tag = f"{img_prefix}:{user}"
            inc_img_tag = f"oneflowinc/{versioned_img_tag}"
            img_tag = inc_img_tag
            # Image selection priority: freshly-built per-user image >
            # explicit custom tag > local versioned tag > official image.
            if args.build_img:
                img_tag = user_img_tag
            elif args.custom_img_tag:
                img_tag = args.custom_img_tag
            else:
                if is_img_existing(versioned_img_tag):
                    img_tag = versioned_img_tag
                elif is_img_existing(inc_img_tag):
                    img_tag = inc_img_tag
                else:
                    raise ValueError(
                        f"img not found, please run 'docker pull {inc_img_tag}'"
                    )
            assert img_tag is not None
            print("using", img_tag)
            if args.build_img:
                build_img(
                    cuda_version,
                    args.oneflow_src_dir,
                    args.use_aliyun_mirror,
                    args.use_tuna,
                    args.use_system_proxy,
                    img_tag,
                    args.dry,
                )
            bash_args = ""
            bash_wrap = ""
            # Select the gcc toolchain (SCL devtoolsets for gcc7/gcc9).
            if args.gcc4:
                bash_wrap = "gcc --version"
            elif args.gcc7:
                bash_wrap = """
source scl_source enable devtoolset-7
gcc --version
"""
            elif args.gcc9:
                bash_wrap = """
source scl_source enable devtoolset-9
gcc --version
"""
            else:
                raise ValueError("either one in gcc4, gcc7, gcc9 must be enabled")
            # cache_dir is module-level so the retry handler below can
            # clean it after a failed build.
            global cache_dir
            if args.cache_dir:
                cache_dir = args.cache_dir
            else:
                cache_dir = os.path.join(os.getcwd(), "manylinux2014-build-cache")
            # One cache subdir per (cuda, mlir, gcc, cpu, shared) config so
            # different configurations never share stale build state.
            sub_dir = cuda_version
            if args.mlir:
                sub_dir += "-mlir"
            if args.gcc4:
                sub_dir += "-gcc4"
            if args.gcc7:
                sub_dir += "-gcc7"
            if args.gcc9:
                sub_dir += "-gcc9"
            if args.cpu:
                assert len(cuda_versions) == 1
                sub_dir += "-cpu"
            if args.shared_lib:
                sub_dir += "-shared"
            cache_dir = os.path.join(cache_dir, sub_dir)
            if args.build_img:
                return
            if args.skip_third_party == False:
                build_third_party(
                    img_tag,
                    args.oneflow_src_dir,
                    cache_dir,
                    extra_oneflow_cmake_args + enforced_oneflow_cmake_args,
                    extra_docker_args,
                    bash_args,
                    bash_wrap,
                    args.dry,
                    args.use_system_proxy,
                    args.inplace,
                )
            print(cuda_version.split("."))
            cuda_version_literal = "".join(cuda_version.split(".")[:2])
            assert len(cuda_version_literal) == 3
            python_versions = args.python_version.split(",")
            python_versions = [pv.strip() for pv in python_versions]
            for python_version in python_versions:
                print("building for python version:", python_version)
                build_oneflow(
                    img_tag,
                    args.oneflow_src_dir,
                    cache_dir,
                    extra_oneflow_cmake_args + enforced_oneflow_cmake_args,
                    extra_docker_args,
                    python_version,
                    args.skip_wheel,
                    args.package_name,
                    args.wheel_house_dir,
                    bash_args,
                    bash_wrap,
                    args.dry,
                    args.use_system_proxy,
                    args.bash,
                    args.skip_audit,
                    args.inplace,
                )
        try:
            build()
        except subprocess.CalledProcessError as e:
            print("failed: ", e.cmd, e.args)
            # With --retry > 0, wipe the (possibly corrupted) cache dir and
            # try once more; otherwise fail the whole script.
            if cache_dir and args.retry > 0:
                print("clean: ", cache_dir, flush=True)
                print("start retrying...", flush=True)
                if args.dry:
                    pass
                else:
                    force_rm_dir(cache_dir)
                build()
            else:
                exit(1)
# Launch an interactive shell in the release manylinux CUDA 11.0 builder
# image, with the current directory bind-mounted and used as the workdir.
set -ex
docker run --rm -it \
    -v `pwd`:`pwd` \
    -w `pwd` oneflow:rel-manylinux2014-cuda-11.0 bash
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Chinese docs: CN_DOCS=1 switches conf.py to the translated docstrings;
# output goes to build-cn.
html_cn: Makefile
	@CN_DOCS=1 $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)-cn" $(SPHINXOPTS) $(O)
html: Makefile
	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Remove both English and Chinese build output.
clean: Makefile
	@rm -rf build build-cn
sphinx==3.5.4
jinja2<3.1
recommonmark==0.6.0
furo==2021.4.11b34
sphinx-copybutton==0.5.0
# above are dev dependencies
--pre
--find-links https://staging.oneflow.info/branch/master/cpu
oneflow
oneflow.autograd
================================================
Functions and classes for autograd.
---------------------------------------------------
.. currentmodule:: oneflow.autograd
.. autoclass:: oneflow.autograd.Function
:members: apply,
:special-members: __call__,
.. automodule:: oneflow.autograd
:members: grad,
backward,
from .math_ops import *
from .activation import *
import oneflow
from oneflow.framework.docstr.utils import reset_docstr
reset_docstr(
oneflow.nn.ReLU,
r"""ReLU(inplace=False)
ReLU 激活函数,对张量中的每一个元素做 element-wise 运算,公式如下:
:math:`\text{ReLU}(x) = (x)^+ = \max(0, x)`
参数:
inplace: 是否做 in-place 操作。 默认为 ``False``
形状:
- Input: :math:`(N, *)` 其中 `*` 的意思是,可以指定任意维度
- Output: :math:`(N, *)` 输入形状与输出形状一致
示例:
.. code-block:: python
>>> import oneflow as flow
>>> import numpy as np
>>> relu = flow.nn.ReLU()
>>> ndarr = np.asarray([1, -2, 3])
>>> x = flow.Tensor(ndarr)
>>> relu(x)
tensor([1., 0., 3.], dtype=oneflow.float32)
""",
)
import oneflow
from oneflow.framework.docstr.utils import reset_docstr
reset_docstr(
oneflow.add,
r"""add(input, other)
计算 `input` 和 `other` 的和。支持 element-wise、标量和广播形式的加法。
公式为:
.. math::
out = input + other
示例:
.. code-block:: python
>>> import numpy as np
>>> import oneflow as flow
# element-wise 加法
>>> x = flow.tensor(np.random.randn(2,3), dtype=flow.float32)
>>> y = flow.tensor(np.random.randn(2,3), dtype=flow.float32)
>>> out = flow.add(x, y).numpy()
>>> out.shape
(2, 3)
# 标量加法
>>> x = 5
>>> y = flow.tensor(np.random.randn(2,3), dtype=flow.float32)
>>> out = flow.add(x, y).numpy()
>>> out.shape
(2, 3)
# 广播加法
>>> x = flow.tensor(np.random.randn(1,1), dtype=flow.float32)
>>> y = flow.tensor(np.random.randn(2,3), dtype=flow.float32)
>>> out = flow.add(x, y).numpy()
>>> out.shape
(2, 3)
""",
)
oneflow.comm
===================================
oneflow communication function
----------------------------------
.. currentmodule:: oneflow.comm
.. automodule:: oneflow.comm
:members: all_reduce,
all_gather,
broadcast,
scatter,
all_to_all,
reduce,
gather,
reduce_scatter,
send,
recv,
barrier,
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import oneflow
sys.path.insert(0, os.path.abspath("."))
CN_DOCS = os.getenv("CN_DOCS")
if CN_DOCS:
import cn
# -- Project information -----------------------------------------------------
project = u"OneFlow"
copyright = u"2020, OneFlow"
author = u"OneFlow"
# The short X.Y version
version = u""
# The full version, including alpha/beta/rc tags
release = u""
# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Sphinx extensions loaded for this build: autodoc pulls in docstrings,
# napoleon parses Google/NumPy-style docstring sections, recommonmark
# enables Markdown sources, and sphinx_copybutton adds a copy button to
# code samples.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "recommonmark",
    "sphinx_copybutton",
]

# Paths that contain templates, relative to this directory.
templates_path = ["_templates"]

# Mapping of source-file suffixes to their markup parser; both .txt and
# .md are treated as Markdown (parsed via recommonmark).
source_suffix = {
    ".rst": "restructuredtext",
    ".txt": "markdown",
    ".md": "markdown",
}
# The master toctree document — the root page of the documentation tree.
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to the Sphinx
# documentation for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = u"en"

# List of patterns, relative to the source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# Pygments (syntax highlighting) style; None selects the theme's default.
pygments_style = None
# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the Sphinx
# documentation for a list of builtin themes.
html_theme = "furo"

# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# theme's documentation.
#
# html_theme_options = {}

# Paths that contain custom static files (such as style sheets), relative
# to this directory. They are copied after the builtin static files, so a
# file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by the theme itself. Builtin themes use these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}

# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for the HTML help builder.
htmlhelp_basename = "OneFlowdoc"
# -- Options for LaTeX output ------------------------------------------------

# All entries commented out: the LaTeX builder runs with Sphinx defaults.
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # LaTeX figure (float) alignment.
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples:
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        "OneFlow.tex",
        u"OneFlow API Reference",
        u"Oneflow Contributors",
        "manual",
    ),
]
# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples:
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "oneflow", u"OneFlow API Reference", [author], 1)]

# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples:
# (source start file, target name, title, author,
#  dir menu entry, description, category).
texinfo_documents = [
    (
        master_doc,
        "OneFlow",
        u"OneFlow API Reference",
        author,
        "OneFlow",
        "OneFlow API Reference",
        "Miscellaneous",
    ),
]
# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be an ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]

# -- Extension configuration -------------------------------------------------

# autodoc defaults: include members that have no docstring, but never
# document the listed members (forward/extra_repr/reset_parameters are
# framework plumbing rather than public API).
autodoc_default_options = {
    "undoc-members": True,
    "exclude-members": "forward, extra_repr, reset_parameters",
}
def should_skip_member(app, what, name, obj, skip, options):
    """``autodoc-skip-member`` callback: hide deprecated and dunder members.

    Args:
        app: the Sphinx application object (unused).
        what: the type of the parent object (e.g. "class", "module").
        name: the name of the member being considered.
        obj: the member object itself.
        skip: autodoc's own decision so far; honored when already True.
        options: the autodoc directive options (unused).

    Returns:
        True if the member should be omitted from the generated docs.
    """
    # Members explicitly marked deprecated by oneflow are dropped; print
    # them so doc builds show what was filtered out.
    is_deprecated = oneflow.is_deprecated(obj)
    if is_deprecated:
        print("skipping deprecated", what, name, obj)
    # Dunder attributes that carry no user-facing documentation value.
    magical = name in ("__weakref__", "__doc__", "__module__", "__dict__")
    return skip or is_deprecated or magical
def setup(app):
    """Sphinx extension hook: register the member-skipping filter so it
    runs for every member autodoc considers."""
    app.connect("autodoc-skip-member", should_skip_member)
oneflow.cuda
===================================
OneFlow CUDA
----------------------------------
.. currentmodule:: oneflow.cuda
.. automodule:: oneflow.cuda
:members: is_available,
device_count,
current_device,
set_device,
synchronize,
manual_seed_all,
manual_seed,
empty_cache,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
\ No newline at end of file
oneflow.distributed
=========================================================
.. currentmodule:: oneflow.distributed
Run the command below to see more about its usage.
::
python3 -m oneflow.distributed.launch -h
.. code-block::
usage: launch.py [-h] [--nnodes NNODES] [--node_rank NODE_RANK]
[--nproc_per_node NPROC_PER_NODE] [--master_addr MASTER_ADDR]
[--master_port MASTER_PORT] [-m] [--no_python]
[--redirect_stdout_and_stderr] [--logdir LOGDIR]
training_script ...
OneFlow distributed training launch helper utility that will spawn up multiple
distributed processes
positional arguments:
training_script The full path to the single GPU training program/script to be
launched in parallel, followed by all the arguments for the
training script
training_script_args
optional arguments:
-h, --help show this help message and exit
--nnodes NNODES The number of nodes to use for distributed training
--node_rank NODE_RANK
The rank of the node for multi-node distributed training
--nproc_per_node NPROC_PER_NODE
The number of processes to launch on each node, for GPU
training, this is recommended to be set to the number of GPUs in
your system so that each process can be bound to a single GPU.
--master_addr MASTER_ADDR
Master node (rank 0)'s address, should be either the IP address
or the hostname of node 0, for single node multi-proc training,
the --master_addr can simply be 127.0.0.1
--master_port MASTER_PORT
Master node (rank 0)'s free port that needs to be used for
communication during distributed training
-m, --module Changes each process to interpret the launch script as a python
module, executing with the same behavior as 'python -m'.
--no_python Do not prepend the training script with "python" - just exec it
directly. Useful when the script is not a Python script.
--redirect_stdout_and_stderr
write the stdout and stderr to files 'stdout' and 'stderr'. Only
available when logdir is set
--logdir LOGDIR Relative path to write subprocess logs to. Passing in a relative
path will create a directory if needed. Note that successive
runs with the same path to write logs to will overwrite existing
logs, so be sure to save logs as needed.
\ No newline at end of file
oneflow.env
===================================
Environment
----------------------------------
.. currentmodule:: oneflow
.. autofunction:: oneflow.env.get_world_size
.. autofunction:: oneflow.env.get_rank
.. autofunction:: oneflow.env.get_local_rank
.. autofunction:: oneflow.env.get_node_size
.. autofunction:: oneflow.env.init_rdma
.. autofunction:: oneflow.env.rdma_is_initialized
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment