Unverified commit 91ebaa0a, authored by robotcator, committed by GitHub
Browse files

support pytorch 2.0 (#30)



* change to torchrun

* change ubuntu 20.04

* change format

* update

* fix build error

* use latest

* fall back

* upgrade nvcr

* fallback

* fall back cu117

* add example

* remove duplicate cmd

* fix urllib

---------
Co-authored-by: jixh <jixh@dp.tech>
Co-authored-by: Guolin Ke <guolin.ke@outlook.com>
parent c04a4bcf
...@@ -25,9 +25,9 @@ jobs: ...@@ -25,9 +25,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- -
name: Build and push cu116 name: Build and push cu117
uses: docker/build-push-action@v3 uses: docker/build-push-action@v3
with: with:
context: ./docker/cu116/ context: ./docker/cu117/
push: true push: true
tags: dptechnology/unicore:${{ github.ref_name }}-pytorch1.12.1-cuda11.6 tags: dptechnology/unicore:${{ github.ref_name }}-pytorch2.0.0-cuda11.7
...@@ -25,9 +25,9 @@ jobs: ...@@ -25,9 +25,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- -
name: Build and push cu116 with rdma name: Build and push cu117 with rdma
uses: docker/build-push-action@v3 uses: docker/build-push-action@v3
with: with:
context: ./docker/rdma/ context: ./docker/rdma/
push: true push: true
tags: dptechnology/unicore:${{ github.ref_name }}-pytorch1.12.1-cuda11.6-rdma tags: dptechnology/unicore:${{ github.ref_name }}-pytorch2.0.0-cuda11.7-rdma
...@@ -36,8 +36,7 @@ jobs: ...@@ -36,8 +36,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
# os: [ubuntu-20.04] os: [ubuntu-20.04]
os: [ubuntu-18.04]
python-version: ['3.7', '3.8', '3.9', '3.10'] python-version: ['3.7', '3.8', '3.9', '3.10']
torch-version: [1.12.0, 1.12.1, 1.13.1, 2.0.0] torch-version: [1.12.0, 1.12.1, 1.13.1, 2.0.0]
cuda-version: ['113', '116', '117', '118'] cuda-version: ['113', '116', '117', '118']
...@@ -98,7 +97,7 @@ jobs: ...@@ -98,7 +97,7 @@ jobs:
- name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }} - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
run: | run: |
pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html pip install torch==${{ matrix.torch-version }}+cu${{ matrix.cuda-version }} --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
python --version python --version
python -c "import torch; print('PyTorch:', torch.__version__)" python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import torch; print('CUDA:', torch.version.cuda)" python -c "import torch; print('CUDA:', torch.version.cuda)"
......
# Uni-Core training image: CUDA 11.7 + cuDNN 8 devel base on Ubuntu 20.04
# (Ubuntu version must match the OFED package downloaded below).
FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu20.04

ENV LANG=C.UTF-8
ENV OFED_VERSION=5.3-1.0.0.1

# ==================================================================
# OS packages — one layer; sorted for diffability; zlib1g-dev is
# pulled in here (needed to build the NCCL plugin below) so later
# layers never have to touch apt again.
# ------------------------------------------------------------------
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
    rm -rf /var/lib/apt/lists/* \
           /etc/apt/sources.list.d/cuda.list \
           /etc/apt/sources.list.d/nvidia-ml.list && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
        software-properties-common \
        && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
        apt-utils \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        debhelper \
        devscripts \
        dmidecode \
        fakeroot \
        git \
        htop \
        jq \
        libaio-dev \
        libcap2 \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        nfs-common \
        openssh-client \
        openssh-server \
        pciutils \
        pdsh \
        rsync \
        screen \
        sudo \
        tmux \
        unrar \
        unzip \
        util-linux \
        vim \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# ==================================================================
# InfiniBand & RDMA: Mellanox OFED user-space stack, then the
# NCCL RDMA/SHARP plugin built against it and local CUDA.
# ------------------------------------------------------------------
RUN cd /tmp && \
    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

RUN cd /tmp && \
    mkdir -p /usr/local/nccl-rdma-sharp-plugins && \
    git clone --depth=1 https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \
    cd nccl-rdma-sharp-plugins && \
    ./autogen.sh && \
    ./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda && \
    make && \
    make install && \
    rm -rf /tmp/nccl-rdma-sharp-plugins

# ==================================================================
# python — Miniconda with Python ${PYTHON_VERSION}
# ------------------------------------------------------------------
# Set timezone
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

ENV PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

ENV PYTHON_VERSION=3.8
# Official Miniconda download host (repo.continuum.io is a legacy redirect).
RUN wget -O ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh
ENV PATH=/opt/conda/bin:$PATH
RUN conda install -y python=${PYTHON_VERSION} && conda clean -ya
RUN conda install -y scipy scikit-learn pyyaml tensorboard tensorboardX && \
    conda clean -ya
RUN ldconfig

# ==================================================================
# pytorch 2.0.0 + cu117, then Uni-Core built from source
# ------------------------------------------------------------------
# Compile CUDA extensions for sm_70 (V100), sm_75 (T4/Turing), sm_80 (A100).
ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0"
RUN conda install -y numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas && \
    conda clean -ya
RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests
# -y keeps the build non-interactive: docker build has no tty for conda's prompt.
RUN conda install -y pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia && \
    conda clean -ya
# NOTE(review): `python setup.py install` is deprecated upstream; kept as-is to
# avoid changing how the CUDA extensions are built — consider `pip install .`.
RUN cd /tmp && \
    git clone https://github.com/dptech-corp/Uni-Core && \
    cd Uni-Core && \
    python setup.py install && \
    rm -rf /tmp/*
RUN pip install --no-cache-dir tokenizers lmdb biopython ml-collections timeout-decorator urllib3 tree dm-tree

# Make the NCCL RDMA plugin and the OFED-bundled OpenMPI visible at runtime.
ENV LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/mpi/gcc/openmpi-4.1.0rc5/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.0rc5/lib:$LD_LIBRARY_PATH

# Final sweep: refresh the linker cache and drop caches/temp files.
RUN ldconfig && \
    apt-get clean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/* /tmp/* && \
    conda clean -ya
FROM nvcr.io/nvidia/pytorch:22.04-py3 FROM nvcr.io/nvidia/pytorch:22.05-py3
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
rm -rf /var/lib/apt/lists/* \ rm -rf /var/lib/apt/lists/* \
...@@ -70,7 +70,7 @@ RUN conda install -y ninja typing && \ ...@@ -70,7 +70,7 @@ RUN conda install -y ninja typing && \
RUN pip3 install --upgrade sentry-sdk requests RUN pip3 install --upgrade sentry-sdk requests
RUN pip3 install --no-cache-dir torch==1.12.1 --extra-index-url https://download.pytorch.org/whl/cu116 && rm -rf ~/.cache/pip RUN pip3 install torch==2.0.0+cu117 --index-url https://download.pytorch.org/whl/cu117 && rm -rf ~/.cache/pip
RUN cd /tmp && \ RUN cd /tmp && \
git clone https://github.com/dptech-corp/Uni-Core && \ git clone https://github.com/dptech-corp/Uni-Core && \
......
...@@ -5,7 +5,8 @@ export OMP_NUM_THREADS=1 ...@@ -5,7 +5,8 @@ export OMP_NUM_THREADS=1
run_name=bert_example run_name=bert_example
save_dir="./save/${run_name}" save_dir="./save/${run_name}"
mkdir -p ${save_dir} mkdir -p ${save_dir}
python -m torch.distributed.launch --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
torchrun --standalone --nnodes=1 --nproc_per_node=$n_gpu $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
--num-workers 0 --ddp-backend=c10d \ --num-workers 0 --ddp-backend=c10d \
--task bert --loss masked_lm --arch bert_base \ --task bert --loss masked_lm --arch bert_base \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \
......
# Multi-node BERT pre-training example launched with torchrun.
# Cluster topology comes from the MLP_* environment variables injected by the
# platform scheduler (GPUs per worker, worker count, rank, rendezvous host/port).

# Fallback defaults for running outside the scheduler.
# NOTE(review): MASTER_PORT and n_gpu are not referenced below (the MLP_* vars
# drive torchrun); kept for parity with the single-node example script.
[ -z "${MASTER_PORT}" ] && MASTER_PORT=10086
[ -z "${n_gpu}" ] && n_gpu=$(nvidia-smi -L | wc -l)

# Surface NCCL communication failures instead of hanging the job.
export NCCL_ASYNC_ERROR_HANDLING=1
# One OpenMP thread per worker avoids CPU oversubscription across ranks.
export OMP_NUM_THREADS=1

run_name=bert_example
save_dir="./save/${run_name}"
mkdir -p "${save_dir}"

# All expansions are quoted so unset or space-containing values fail loudly
# in torchrun's argument parsing rather than silently word-splitting.
torchrun --nproc_per_node="${MLP_WORKER_GPU}" --nnodes="${MLP_WORKER_NUM}" --node_rank="${MLP_ROLE_INDEX}" --master_addr="${MLP_WORKER_0_HOST}" --master_port="${MLP_WORKER_0_PORT}" \
    $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
    --num-workers 0 --ddp-backend=c10d \
    --task bert --loss masked_lm --arch bert_base \
    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \
    --lr-scheduler polynomial_decay --lr 1e-4 --warmup-updates 100 --total-num-update 10000 --batch-size 4 \
    --update-freq 1 --seed 1 \
    --fp16 --fp16-init-scale 4 --fp16-scale-window 256 --tensorboard-logdir "${save_dir}/tsb" \
    --max-update 10000 --log-interval 100 --log-format simple \
    --save-interval-updates 1000 --validate-interval-updates 1000 --keep-interval-updates 30 --no-epoch-checkpoints \
    --save-dir "${save_dir}"
...@@ -187,7 +187,7 @@ def call_main(args, main, **kwargs): ...@@ -187,7 +187,7 @@ def call_main(args, main, **kwargs):
join=True, join=True,
) )
else: else:
distributed_main(args.device_id, main, args, kwargs) distributed_main(int(os.environ['LOCAL_RANK']), main, args, kwargs)
else: else:
# single GPU main # single GPU main
main(args, **kwargs) main(args, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment