# Python CircleCI 2.0 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
# Adopted from
# https://github.com/facebookresearch/detectron2/blob/main/.circleci/config.yml
#
# Pro tip: download circle ci cli to validate the config locally during development.
#
# To reset/clean the cache update the CACHE_VERSION variable in project settings
# in the fairscale project in CircleCI's web UI. The CACHE_VERSION follows the convention
# v$(FAIRSCALE_VERSION)-${CACHE_NUMBER}. E.g. v0.4.2-1. CACHE_NUMBER must start
# at 1 and increase in whole numbers. When changing the CACHE_VERSION manually
# always set the FAIRSCALE_VERSION value to the fairscale version being tested.
# To reset the cache when not updating the fairscale version, only update the
# CACHE_NUMBER value.

version: 2.1

orbs:
  codecov: codecov/codecov@1.0.2

# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
cpu_py38: &cpu_py38
  docker:
    - image: circleci/python:3.8
  resource_class: large

cpu_py39: &cpu_py39
  docker:
    - image: circleci/python:3.9
  resource_class: large

cpu_py310: &cpu_py310
  docker:
    - image: circleci/python:3.10
  resource_class: large

# Here is the list of GPU images:
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
# We need to use multiple gpus for several jobs. The resource_class
# values are available here T101565170
# gpu.nvidia.small.multi = 2 gpus with 16 GB ram each
# gpu.nvidia.medium.multi = 4 gpus with 16 GB ram each
gpu_cu_11_2_small_multi: &gpu_cu_11_2_small_multi
  environment:
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.small.multi

gpu_cu_11_2_medium_multi: &gpu_cu_11_2_medium_multi
  environment:
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.medium.multi

# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
# Creates ~/venv and appends its activation line to $BASH_ENV so that later
# steps pick it up.
setup_venv: &setup_venv
  - run:
      name: Setup Virtual Env
      working_directory: ~/
      # use bash -x for debug early commands executed in .bashrc.
      shell: /bin/bash
      command: |
        set -e
        set -o pipefail
        python -m venv ~/venv
        echo ". ~/venv/bin/activate" >> $BASH_ENV
        . ~/venv/bin/activate
        python --version
        which python
        which pip
        pip install --upgrade pip

# NOTE for all install_dep_pytorch_* steps below: the version passed to
# check_version.py must match the torch version that the step installs and
# asserts. check_version.py is downloaded from an external repository;
# presumably it exits 0 when the installed package satisfies the comparison
# (confirm against that script). When it succeeds the step exits early and the
# restored venv cache is used as-is.

# most recent LTS version
install_dep_pytorch_lts: &install_dep_pytorch_lts
  - run:
      name: Install Dependencies with torch 1.8.2 (LTS)
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.8 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.8.2+cu111 torchvision==0.9.2+cu111 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "8"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

# most recent stable version
install_dep_pytorch_stable: &install_dep_pytorch_stable
  - run:
      name: Install Dependencies with torch 1.12.0
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.12 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.12.0+cu113 torchvision==0.13.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "12"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

install_dep_pytorch_nightly: &install_dep_pytorch_nightly
  - run:
      name: Install Dependencies with a torch nightly preview build
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.13 && exit 0; fi
        # start installing
        pip install --progress-bar off --pre torch==1.13.0.dev20220825+cu113 torchvision==0.14.0.dev20220825+cu113 --extra-index-url https://download.pytorch.org/whl/nightly/cu113
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "13"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

install_repo: &install_repo
  - run:
      name: Install Repository
      command: |
        pip install .
        # Test import.
        python -c 'import sys; sys.path = sys.path[1:]; import fairscale'

check_test_list: &check_test_list
  - run:
      name: Verify that unit test list files are correct
      command: |
        bash ./tests/ci_test_list_check.sh

upload_coverage: &upload_coverage
  - codecov/upload:
      file: 'coverage.xml'
      token: $CODECOV_TOKEN

run_offload_benchmark: &run_offload_benchmark
  - run:
      name: Run Offload Benchmark
      command: |
        PYTHONPATH=. python benchmarks/experimental/offload.py --checkpoint_activation

run_fsdp_benchmark: &run_fsdp_benchmark
  - run:
      name: Run FSDP Benchmark
      command: |
        PYTHONPATH=. python benchmarks/fsdp.py --use_synthetic_data

run_pipe_benchmark: &run_pipe_benchmark
  - run:
      name: Run Pipe Benchmark
      command: |
        PYTHONPATH=. python benchmarks/pipe.py

run_oss_benchmark: &run_oss_benchmark
  - run:
      name: Run OSS Benchmark
      command: |
        PYTHONPATH=. python benchmarks/oss.py --world_size 4 --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --check_regression --world_size 4 --optim_type oss_sharded_ddp --epochs 12

run_oss_gloo: &run_oss_gloo
  - run:
      name: Run OSS with Gloo
      command: |
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_ddp --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_sharded_ddp --epochs 2

run_oss_amp: &run_oss_amp
  - run:
      name: Run OSS with Torch AMP
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp

run_oss_for_each: &run_oss_for_each
  - run:
      name: Run OSS with Torch AMP and ForEach optmizer
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp --multi_tensor_optim

# Builds the docs and fails the step if "warning" appears in the tail of the
# build output.
run_doc_build: &run_doc_build
  - run:
      name: Testing doc build
      command: |
        cd docs
        pip install --progress-bar off -r requirements.txt
        make help
        make singlehtml | tee make.out
        ! tail make.out | grep -q warning

# This is an alias to run all unit tests possible on a platform.
run_unittests: &run_unittests
  - run:
      name: Run all unit tests.
      # We run all and not stopping on failure on CPU since docker time is cheaper.
      command: |
        ulimit -n 10000
        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./

commands:
  # This is a command (like a function) that run tests from a given test_list_file.
  # If test_list_file is not given, this results in an error.
  run_unittests_from_list:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"  # Default to error out
    steps:
      - run:
          name: Run Unit Tests
          command: |
            ulimit -n 10000
            if [ ! -f << parameters.test_list_file >> ]; then exit 1; fi
            pytest --junitxml=test-results/junit.xml --verbose --timeout 70 --cov-report=xml --cov=./ `cat << parameters.test_list_file >>`

  # Installs and selects the requested Python version through pyenv.
  # NOTE(review): the cache key below is hard-coded to 3-9-7 even though the
  # version is a parameter; every caller in this file passes 3.9.7. Rename the
  # key if another version is ever requested.
  setup_pyenv:
    parameters:
      version:
        type: string
    steps:
      # Cache the pyenv download directory to avoid re-downloading over and over.
      - restore_cache:
          keys:
            - cache-key-pyenv-3-9-7-v1
      - run:
          name: Setup pyenv
          # We used to use the following commands to update pyenv.
          #   git clone https://github.com/pyenv/pyenv-update.git $(pyenv root)/plugins/pyenv-update
          #   pyenv update
          # However, it is not deterministic since pyenv is being updated.
          # It is now fixed to a version. (v2.3.0 is broken since it cause bash to fail when it try to do "eval $(pyenv init -)")
          #
          # We use "-sf" to skip already installed version after cache restoring the pyenv dir. This should avoid redoing
          # downloading and installing. The key name has a "v1" in the end so that we can debug and try different cache objects
          # while doing development.
          command: |
            cd /opt/circleci/.pyenv/
            git remote update
            git checkout v2.2.0
            pyenv install -sf << parameters.version >>
            pyenv global << parameters.version >>
      - save_cache:
          paths:
            - /opt/circleci/.pyenv
          key: cache-key-pyenv-3-9-7-v1

# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------
# The torch version embedded in each venv cache key below must be kept in sync
# with the torch version installed by the install_dep_pytorch_* step that the
# job uses, so that a venv saved for an older torch is never restored.
jobs:
  cpu_tests_py38:
    <<: *cpu_py38

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py38-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py38-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  cpu_tests_py39:
    <<: *cpu_py39

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py39-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py39-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  cpu_tests_py310:
    <<: *cpu_py310

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py310-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py310-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  gpu_tests_lts:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Run this to make sure we use python3 from the system.
      - setup_pyenv:
          version: "3.9.7"

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_lts

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: << parameters.test_list_file >>

      - store_test_results:
          path: test-results

      - <<: *upload_coverage

  gpu_tests_stable:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Run this to make sure we use python3 from the system.
      - setup_pyenv:
          version: "3.9.7"

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: << parameters.test_list_file >>

      - store_test_results:
          path: test-results

  gpu_tests_pytorch_nightly:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Run this to make sure we use python3 from the system.
      - setup_pyenv:
          version: "3.9.7"

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-13-0-dev20220825-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_nightly

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-13-0-dev20220825-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: << parameters.test_list_file >>

      - store_test_results:
          path: test-results

  benchmarks_1:
    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: "3.9.7"

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - <<: *run_pipe_benchmark
      - <<: *run_offload_benchmark
      - <<: *run_oss_amp
      - <<: *run_oss_for_each
      - <<: *run_oss_gloo
      - <<: *run_fsdp_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

  benchmarks_2:
    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: "3.9.7"

      - <<: *setup_venv

      # Do this first to test an issue like #1042. We keep benchmark_1 doing it
      # after requirements-dev.txt, but change benchmark_2 to do this earlier to
      # ensure both cases are tested.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_oss_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

workflows:
  version: 2
  build:
    jobs:
      - cpu_tests_py38
      - cpu_tests_py39
      - cpu_tests_py310
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_3.txt
      - benchmarks_1
      - benchmarks_2