Unverified commit 91ebaa0a, authored by robotcator, committed by GitHub
Browse files

support pytorch 2.0 (#30)



* change to torchrun

* change ubuntu 20.04

* change format

* update

* fix build error

* use latest

* fall back

* upgrade nvcr

* fallback

* fall back cu117

* add example

* remove duplicate cmd

* fix urllib

---------
Co-authored-by: jixh <jixh@dp.tech>
Co-authored-by: Guolin Ke <guolin.ke@outlook.com>
parent c04a4bcf
...@@ -25,9 +25,9 @@ jobs: ...@@ -25,9 +25,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- -
name: Build and push cu116 name: Build and push cu117
uses: docker/build-push-action@v3 uses: docker/build-push-action@v3
with: with:
context: ./docker/cu116/ context: ./docker/cu117/
push: true push: true
tags: dptechnology/unicore:${{ github.ref_name }}-pytorch1.12.1-cuda11.6 tags: dptechnology/unicore:${{ github.ref_name }}-pytorch2.0.0-cuda11.7
...@@ -25,9 +25,9 @@ jobs: ...@@ -25,9 +25,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }} username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }} password: ${{ secrets.DOCKERHUB_TOKEN }}
- -
name: Build and push cu116 with rdma name: Build and push cu117 with rdma
uses: docker/build-push-action@v3 uses: docker/build-push-action@v3
with: with:
context: ./docker/rdma/ context: ./docker/rdma/
push: true push: true
tags: dptechnology/unicore:${{ github.ref_name }}-pytorch1.12.1-cuda11.6-rdma tags: dptechnology/unicore:${{ github.ref_name }}-pytorch2.0.0-cuda11.7-rdma
...@@ -36,8 +36,7 @@ jobs: ...@@ -36,8 +36,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
# os: [ubuntu-20.04] os: [ubuntu-20.04]
os: [ubuntu-18.04]
python-version: ['3.7', '3.8', '3.9', '3.10'] python-version: ['3.7', '3.8', '3.9', '3.10']
torch-version: [1.12.0, 1.12.1, 1.13.1, 2.0.0] torch-version: [1.12.0, 1.12.1, 1.13.1, 2.0.0]
cuda-version: ['113', '116', '117', '118'] cuda-version: ['113', '116', '117', '118']
...@@ -98,7 +97,7 @@ jobs: ...@@ -98,7 +97,7 @@ jobs:
- name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }} - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
run: | run: |
pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses && conda clean -ya
pip install --no-index --no-cache-dir torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}/torch_stable.html pip install torch==${{ matrix.torch-version }}+cu${{ matrix.cuda-version }} --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda-version }}
python --version python --version
python -c "import torch; print('PyTorch:', torch.__version__)" python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import torch; print('CUDA:', torch.version.cuda)" python -c "import torch; print('CUDA:', torch.version.cuda)"
......
# Uni-Core training image: CUDA 11.7 + cuDNN 8 devel base on Ubuntu 20.04
# (Ubuntu version must match the OFED package downloaded below).
FROM nvidia/cuda:11.7.0-cudnn8-devel-ubuntu20.04

ENV LANG=C.UTF-8
ENV OFED_VERSION=5.3-1.0.0.1

# ==================================================================
# OS packages — one layer; sorted for diffability; zlib1g-dev is
# pulled in here (needed to build the NCCL plugin below) so later
# layers never have to touch apt again.
# ------------------------------------------------------------------
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
    rm -rf /var/lib/apt/lists/* \
           /etc/apt/sources.list.d/cuda.list \
           /etc/apt/sources.list.d/nvidia-ml.list && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
        software-properties-common \
        && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive $APT_INSTALL \
        apt-utils \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        debhelper \
        devscripts \
        dmidecode \
        fakeroot \
        git \
        htop \
        jq \
        libaio-dev \
        libcap2 \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        nfs-common \
        openssh-client \
        openssh-server \
        pciutils \
        pdsh \
        rsync \
        screen \
        sudo \
        tmux \
        unrar \
        unzip \
        util-linux \
        vim \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# ==================================================================
# InfiniBand & RDMA: Mellanox OFED user-space stack, then the
# NCCL RDMA/SHARP plugin built against it and local CUDA.
# ------------------------------------------------------------------
RUN cd /tmp && \
    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

RUN cd /tmp && \
    mkdir -p /usr/local/nccl-rdma-sharp-plugins && \
    git clone --depth=1 https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \
    cd nccl-rdma-sharp-plugins && \
    ./autogen.sh && \
    ./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda && \
    make && \
    make install && \
    rm -rf /tmp/nccl-rdma-sharp-plugins

# ==================================================================
# python — Miniconda with Python ${PYTHON_VERSION}
# ------------------------------------------------------------------
# Set timezone
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

ENV PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

ENV PYTHON_VERSION=3.8
# Official Miniconda download host (repo.continuum.io is a legacy redirect).
RUN wget -O ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh
ENV PATH=/opt/conda/bin:$PATH
RUN conda install -y python=${PYTHON_VERSION} && conda clean -ya
RUN conda install -y scipy scikit-learn pyyaml tensorboard tensorboardX && \
    conda clean -ya
RUN ldconfig

# ==================================================================
# pytorch 2.0.0 + cu117, then Uni-Core built from source
# ------------------------------------------------------------------
# Compile CUDA extensions for sm_70 (V100), sm_75 (T4/Turing), sm_80 (A100).
ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0"
RUN conda install -y numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas && \
    conda clean -ya
RUN pip3 install --no-cache-dir --upgrade sentry-sdk requests
# -y keeps the build non-interactive: docker build has no tty for conda's prompt.
RUN conda install -y pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia && \
    conda clean -ya
# NOTE(review): `python setup.py install` is deprecated upstream; kept as-is to
# avoid changing how the CUDA extensions are built — consider `pip install .`.
RUN cd /tmp && \
    git clone https://github.com/dptech-corp/Uni-Core && \
    cd Uni-Core && \
    python setup.py install && \
    rm -rf /tmp/*
RUN pip install --no-cache-dir tokenizers lmdb biopython ml-collections timeout-decorator urllib3 tree dm-tree

# Make the NCCL RDMA plugin and the OFED-bundled OpenMPI visible at runtime.
ENV LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/mpi/gcc/openmpi-4.1.0rc5/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-4.1.0rc5/lib:$LD_LIBRARY_PATH

# Final sweep: refresh the linker cache and drop caches/temp files.
RUN ldconfig && \
    apt-get clean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/* /tmp/* && \
    conda clean -ya
FROM nvcr.io/nvidia/pytorch:22.04-py3 FROM nvcr.io/nvidia/pytorch:22.05-py3
RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \ RUN APT_INSTALL="apt-get install -y --no-install-recommends" && \
rm -rf /var/lib/apt/lists/* \ rm -rf /var/lib/apt/lists/* \
...@@ -70,7 +70,7 @@ RUN conda install -y ninja typing && \ ...@@ -70,7 +70,7 @@ RUN conda install -y ninja typing && \
RUN pip3 install --upgrade sentry-sdk requests RUN pip3 install --upgrade sentry-sdk requests
RUN pip3 install --no-cache-dir torch==1.12.1 --extra-index-url https://download.pytorch.org/whl/cu116 && rm -rf ~/.cache/pip RUN pip3 install torch==2.0.0+cu117 --index-url https://download.pytorch.org/whl/cu117 && rm -rf ~/.cache/pip
RUN cd /tmp && \ RUN cd /tmp && \
git clone https://github.com/dptech-corp/Uni-Core && \ git clone https://github.com/dptech-corp/Uni-Core && \
......
...@@ -5,7 +5,8 @@ export OMP_NUM_THREADS=1 ...@@ -5,7 +5,8 @@ export OMP_NUM_THREADS=1
run_name=bert_example run_name=bert_example
save_dir="./save/${run_name}" save_dir="./save/${run_name}"
mkdir -p ${save_dir} mkdir -p ${save_dir}
python -m torch.distributed.launch --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
torchrun --standalone --nnodes=1 --nproc_per_node=$n_gpu $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
--num-workers 0 --ddp-backend=c10d \ --num-workers 0 --ddp-backend=c10d \
--task bert --loss masked_lm --arch bert_base \ --task bert --loss masked_lm --arch bert_base \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \
......
# Multi-node BERT pre-training example launched with torchrun.
# Cluster topology comes from the MLP_* environment variables injected by the
# platform scheduler (GPUs per worker, worker count, rank, rendezvous host/port).

# Fallback defaults for running outside the scheduler.
# NOTE(review): MASTER_PORT and n_gpu are not referenced below (the MLP_* vars
# drive torchrun); kept for parity with the single-node example script.
[ -z "${MASTER_PORT}" ] && MASTER_PORT=10086
[ -z "${n_gpu}" ] && n_gpu=$(nvidia-smi -L | wc -l)

# Surface NCCL communication failures instead of hanging the job.
export NCCL_ASYNC_ERROR_HANDLING=1
# One OpenMP thread per worker avoids CPU oversubscription across ranks.
export OMP_NUM_THREADS=1

run_name=bert_example
save_dir="./save/${run_name}"
mkdir -p "${save_dir}"

# All expansions are quoted so unset or space-containing values fail loudly
# in torchrun's argument parsing rather than silently word-splitting.
torchrun --nproc_per_node="${MLP_WORKER_GPU}" --nnodes="${MLP_WORKER_NUM}" --node_rank="${MLP_ROLE_INDEX}" --master_addr="${MLP_WORKER_0_HOST}" --master_port="${MLP_WORKER_0_PORT}" \
    $(which unicore-train) ./example_data --user-dir . --valid-subset valid \
    --num-workers 0 --ddp-backend=c10d \
    --task bert --loss masked_lm --arch bert_base \
    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 1.0 \
    --lr-scheduler polynomial_decay --lr 1e-4 --warmup-updates 100 --total-num-update 10000 --batch-size 4 \
    --update-freq 1 --seed 1 \
    --fp16 --fp16-init-scale 4 --fp16-scale-window 256 --tensorboard-logdir "${save_dir}/tsb" \
    --max-update 10000 --log-interval 100 --log-format simple \
    --save-interval-updates 1000 --validate-interval-updates 1000 --keep-interval-updates 30 --no-epoch-checkpoints \
    --save-dir "${save_dir}"
...@@ -187,7 +187,7 @@ def call_main(args, main, **kwargs): ...@@ -187,7 +187,7 @@ def call_main(args, main, **kwargs):
join=True, join=True,
) )
else: else:
distributed_main(args.device_id, main, args, kwargs) distributed_main(int(os.environ['LOCAL_RANK']), main, args, kwargs)
else: else:
# single GPU main # single GPU main
main(args, **kwargs) main(args, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment