config.yml 22.2 KB
Newer Older
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
1
2
3
4
5
# Python CircleCI 2.0 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
# Adapted from
# https://github.com/facebookresearch/detectron2/blob/main/.circleci/config.yml
#
# Pro tip: download the CircleCI CLI to validate the config locally during development.
#
# To reset/clean the cache, update the CACHE_VERSION variable in the project settings
# of the fairscale project in CircleCI's web UI. The CACHE_VERSION follows the convention
# v${FAIRSCALE_VERSION}-${CACHE_NUMBER}, e.g. v0.4.2-1. CACHE_NUMBER must start
# at 1 and increase in whole numbers. When changing the CACHE_VERSION manually,
# always set the FAIRSCALE_VERSION value to the fairscale version being tested.
# To reset the cache when not updating the fairscale version, only update the
# CACHE_NUMBER value.
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
17

Min Xu's avatar
Min Xu committed
18
version: 2.1
Min Xu's avatar
Min Xu committed
19

Min Xu's avatar
Min Xu committed
20
21
# Orbs imported by this config. The codecov orb provides the `codecov/upload`
# step that the `upload_coverage` alias refers to.
orbs:
  codecov: codecov/codecov@1.0.2
Min Xu's avatar
Min Xu committed
22

Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
23
24
25
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
26
27
28
# CPU executor: Docker image with Python 3.8 on a "large" resource class.
cpu_py38: &cpu_py38
  docker:
    - image: circleci/python:3.8
  resource_class: large
30
31
32
33

# CPU executor: Docker image with Python 3.9 on a "large" resource class.
cpu_py39: &cpu_py39
  docker:
    - image: circleci/python:3.9
  resource_class: large
35

Min Xu's avatar
Min Xu committed
36
37
38
39
40
# CPU executor: Docker image with Python 3.10 on a "large" resource class.
cpu_py310: &cpu_py310
  resource_class: large
  docker:
    - image: "circleci/python:3.10"

Min Xu's avatar
Min Xu committed
41
42
43
44
45
# CPU executor: Docker image with Python 3.11.0 on a "large" resource class.
cpu_py311: &cpu_py311
  resource_class: large
  docker:
    - image: "cimg/python:3.11.0"

46
# Here is the list of GPU images:
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
# We need to use multiple GPUs for several jobs. The resource_class
# values are documented in T101565170:
#   gpu.nvidia.small.multi  = 2 GPUs with 16 GB RAM each
#   gpu.nvidia.medium.multi = 4 GPUs with 16 GB RAM each

# GPU executor: CUDA 11.2 machine image on the small multi-GPU resource class.
gpu_cu_11_2_small_multi: &gpu_cu_11_2_small_multi
  environment:
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.small.multi
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
60

Anupam Bhatnagar's avatar
Anupam Bhatnagar committed
61
# GPU executor: CUDA 11.2 machine image on the medium multi-GPU resource class.
gpu_cu_11_2_medium_multi: &gpu_cu_11_2_medium_multi
  environment:
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.medium.multi
68

Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
69
70
71
72
73
74
75
# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
# Creates ~/venv, activates it for this step, and appends the activation line
# to $BASH_ENV so that later steps pick the venv up as well.
setup_venv: &setup_venv
  - run:
      name: Setup Virtual Env
      working_directory: ~/
      # Tip: change this to "/bin/bash -x" to debug early commands executed
      # from .bashrc. Because the shell is given explicitly here, the script
      # enables errexit/pipefail itself below.
      shell: /bin/bash
      command: |
        set -e
        set -o pipefail
        python -m venv ~/venv
        echo ". ~/venv/bin/activate" >> "$BASH_ENV"
        . ~/venv/bin/activate
        python --version
        which python
        which pip
        pip install --upgrade pip

Anupam Bhatnagar's avatar
Anupam Bhatnagar committed
89
# most recent LTS version
#
# Installs torch 1.8.2 (cu111 LTS wheels) plus the dev and benchmark
# requirements. The step exits early when a restored venv cache already holds
# a matching torch, as reported by check_version.py stored inside the venv.
install_dep_pytorch_lts: &install_dep_pytorch_lts
  - run:
      name: Install Dependencies with torch 1.8.2 (LTS)
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.8 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.8.2+cu111 torchvision==0.9.2+cu111 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "8"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
104

Min Xu's avatar
Min Xu committed
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# most recent stable version on cpu
# for testing 3.8/9/10/11, make sure torch has the right whl files.
#
# torch is pinned to 1.13.0 so the install agrees with the step name, the
# cache check and the version assertion below; an unpinned install would start
# failing the assertion as soon as a newer torch is published. torchvision is
# left unpinned on purpose so pip can pick whatever build is compatible with
# the interpreter in use.
install_dep_pytorch_stable_cpu: &install_dep_pytorch_stable_cpu
  - run:
      name: Install Dependencies with torch 1.13.0 on CPU
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.13 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.13.0 torchvision
        pip install --progress-bar off -r requirements-dev.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "13"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

Anupam Bhatnagar's avatar
Anupam Bhatnagar committed
121
# most recent stable version
#
# Installs torch 1.13.0 / torchvision 0.14.0 with the cu117 extra index, plus
# the dev and benchmark requirements. The step exits early when a restored
# venv cache already holds a matching torch.
install_dep_pytorch_stable: &install_dep_pytorch_stable
  - run:
      name: Install Dependencies with torch 1.13.0
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.13 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.13.0 --extra-index-url https://download.pytorch.org/whl/cu117
        pip install --progress-bar off torchvision==0.14.0 --extra-index-url https://download.pytorch.org/whl/cu117
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "13"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

138
139
140
141
142
# Installs a pinned torch nightly (1.14.0.dev20221121, cu117) plus the dev and
# benchmark requirements. The step exits early when a restored venv cache
# already holds a matching torch.
install_dep_pytorch_nightly: &install_dep_pytorch_nightly
  - run:
      name: Install Dependencies with a torch nightly preview build
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.14 && exit 0; fi
        # start installing
        pip install --progress-bar off --pre torch==1.14.0.dev20221121+cu117 torchvision==0.15.0.dev20221121+cu117 --extra-index-url https://download.pytorch.org/whl/nightly/cu117
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "14"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

153
# Installs the checked-out repository with pip and verifies that `fairscale`
# can be imported once the first sys.path entry has been dropped.
install_repo: &install_repo
  - run:
      name: Install Repository
      command: |
        pip install .
        # Test import.
        python -c 'import sys; sys.path = sys.path[1:]; import fairscale'
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
160

161
162
163
164
165
166
# Runs tests/ci_test_list_check.sh to validate the unit test list files.
check_test_list: &check_test_list
  - run:
      command: |
        bash ./tests/ci_test_list_check.sh
      name: Verify that unit test list files are correct

167
168
169
170
# Uploads coverage.xml through the codecov orb's upload step.
upload_coverage: &upload_coverage
  - codecov/upload:
      token: "$CODECOV_TOKEN"
      file: coverage.xml
msbaines's avatar
msbaines committed
171

172
173
174
175
# Runs the experimental offload benchmark with activation checkpointing.
run_offload_benchmark: &run_offload_benchmark
  - run:
      name: Run Offload Benchmark
      command: |
        PYTHONPATH=. python benchmarks/experimental/offload.py --checkpoint_activation
177

178
179
180
181
# Runs the FSDP benchmark on synthetic data.
run_fsdp_benchmark: &run_fsdp_benchmark
  - run:
      name: Run FSDP Benchmark
      command: |
        PYTHONPATH=. python benchmarks/fsdp.py --use_synthetic_data
183

Jun Ru Anderson's avatar
Jun Ru Anderson committed
184
# Runs the Pipe benchmark with its default arguments.
run_pipe_benchmark: &run_pipe_benchmark
  - run:
      name: Run Pipe Benchmark
      command: |
        PYTHONPATH=. python benchmarks/pipe.py
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
189

190
191
192
193
# Runs the OSS benchmark with world size 4: a short 2-epoch run, then a
# 12-epoch oss_sharded_ddp run with regression checking enabled.
run_oss_benchmark: &run_oss_benchmark
  - run:
      name: Run OSS Benchmark
      command: |
        PYTHONPATH=. python benchmarks/oss.py --world_size 4 --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --check_regression --world_size 4 --optim_type oss_sharded_ddp --epochs 12
196
197

# Runs the OSS benchmark with --gloo for the oss_ddp and oss_sharded_ddp
# optimizer types.
run_oss_gloo: &run_oss_gloo
  - run:
      name: Run OSS with Gloo
      command: |
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_ddp --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_sharded_ddp --epochs 2
203

204
# Runs the OSS benchmark with --amp for the oss_sharded_ddp optimizer type.
run_oss_amp: &run_oss_amp
  - run:
      name: Run OSS with Torch AMP
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp
209

210
211
212
213
# Runs the OSS benchmark with --amp and --multi_tensor_optim for the
# oss_sharded_ddp optimizer type.
run_oss_for_each: &run_oss_for_each
  - run:
      name: Run OSS with Torch AMP and ForEach optimizer
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp --multi_tensor_optim
215

216
217
218
219
220
221
222
223
224
# Builds the docs as single-page HTML and fails the step when the tail of the
# build output mentions a warning.
run_doc_build: &run_doc_build
  - run:
      name: Testing doc build
      # The final line is negated: it succeeds only when `grep` finds no
      # "warning" in the tail of make.out.
      command: |
        cd docs
        pip install --progress-bar off -r requirements.txt
        make help
        make singlehtml | tee make.out
        ! tail make.out | grep -q warning
225

226
227
228
229
230
231
# This is an alias to run all unit tests possible on a platform.
run_unittests: &run_unittests
  - run:
      name: Run all unit tests.
      # We run all tests and do not stop on the first failure on CPU, since
      # docker time is cheaper.
      command: |
        ulimit -n 10000
        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./
234

Min Xu's avatar
Min Xu committed
235
commands:
236
237
238
239

   # This is a command (like a function) that runs the tests listed in a given
   # test_list_file. If test_list_file is not given, or the file does not
   # exist, the step fails with an explanatory message.
   run_unittests_from_list:
     parameters:
       test_list_file:
         type: string
         default: "/dev/non_exist"  # Default to error out
     steps:
       - run:
           name: Run Unit Tests
           command: |
             ulimit -n 10000
             if [ ! -f "<<parameters.test_list_file>>" ]; then
               echo "test list file not found: <<parameters.test_list_file>>" >&2
               exit 1
             fi
             pytest --junitxml=test-results/junit.xml --verbose --timeout 70 --cov-report=xml --cov=./ $(cat "<<parameters.test_list_file>>")
Min Xu's avatar
Min Xu committed
251

252
253
254
255
256
   # Installs the requested Python version with pyenv and makes it the global
   # default. The pyenv directory is cached per requested version.
   setup_pyenv:
     parameters:
       version:
         type: string
     steps:
       # Cache the pyenv download directory to avoid re-downloading over and over.
       # The key embeds the requested version so that different versions do not
       # share (and never refresh) a single cache entry.
       - restore_cache:
           keys:
             - cache-key-pyenv-<<parameters.version>>-v1

       - run:
           name: Setup pyenv
           # We used to use the following commands to update pyenv.
           #   git clone https://github.com/pyenv/pyenv-update.git $(pyenv root)/plugins/pyenv-update
           #   pyenv update
           # However, that is not deterministic since pyenv itself keeps being updated.
           # It is now fixed to a version. (v2.3.0 is broken since it causes bash to fail
           # when it tries to do "eval $(pyenv init -)".)
           #
           # We use "-sf" to skip an already installed version after restoring the pyenv
           # dir from cache. This should avoid redoing the download and install. The key
           # name has a "v1" at the end so that we can debug and try different cache
           # objects while doing development.
           command: |
             cd /opt/circleci/.pyenv/
             git remote update
             git checkout v2.2.0
             pyenv install -sf <<parameters.version>>
             pyenv global <<parameters.version>>

       - save_cache:
           paths:
             - /opt/circleci/.pyenv
           key: cache-key-pyenv-<<parameters.version>>-v1

Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
285
286
287
288
289
# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------

jobs:
Min Xu's avatar
Min Xu committed
290
291
  # CPU job for Python 3.8: install the repo, restore or build the venv with
  # stable torch, then run the unit tests and the doc build.
  cpu_tests_py38:
    <<: *cpu_py38

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py38-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}
      - <<: *install_dep_pytorch_stable_cpu

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py38-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
320

Min Xu's avatar
Min Xu committed
321
322
  # CPU job for Python 3.9: install the repo, restore or build the venv with
  # stable torch, then run the unit tests and the doc build.
  cpu_tests_py39:
    <<: *cpu_py39

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py39-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable_cpu

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py39-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

Min Xu's avatar
Min Xu committed
353
354
  # CPU job for Python 3.10: install the repo, restore or build the venv with
  # stable torch, then run the unit tests and the doc build.
  cpu_tests_py310:
    <<: *cpu_py310

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py310-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable_cpu

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py310-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  # CPU job for Python 3.11: same flow as the other CPU jobs, plus a source
  # build of torchvision and a pinned numpy before the venv is cached.
  cpu_tests_py311:
    <<: *cpu_py311

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-cpu-py311-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable_cpu

      # torchvision doesn't have cp311 binary whl yet and it is needed by some tests.
      - run: pip install git+https://github.com/pytorch/vision@677fc939b21a8893f07db4c1f90482b648b6573f
      # numpy for 3.11 needs a newer binary whl.
      - run: pip install numpy==1.23.5

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py311-torch-stable-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

Min Xu's avatar
Min Xu committed
422
  # GPU job against torch 1.8.2 (LTS). Runs the tests listed in the
  # `test_list_file` parameter; the default path does not exist, so a caller
  # that omits the parameter gets a failing job.
  gpu_tests_lts:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select Python 3.9.7 through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_lts

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results

      # Disabled codecov since https://codecov.io/bash seems to be down.
      #- <<: *upload_coverage
Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
465

Min Xu's avatar
Min Xu committed
466
  # GPU job against the stable torch release (1.13.0). Runs the tests listed
  # in the `test_list_file` parameter; the default path does not exist, so a
  # caller that omits the parameter gets a failing job.
  gpu_tests_stable:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select Python 3.9.7 through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies. The key names the
      # torch version installed by install_dep_pytorch_stable, so a torch bump
      # starts from a fresh cache entry instead of a stale venv.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results
506

507
  # GPU job against the pinned torch nightly build (1.14.0.dev20221121). Runs
  # the tests listed in the `test_list_file` parameter; the default path does
  # not exist, so a caller that omits the parameter gets a failing job.
  gpu_tests_pytorch_nightly:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select Python 3.9.7 through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies. The key names the
      # nightly build installed by install_dep_pytorch_nightly, so changing the
      # pinned nightly starts from a fresh cache entry instead of a stale venv.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-14-0-dev20221121-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_nightly

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-14-0-dev20221121-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results

548
  # Benchmark job on the small multi-GPU executor: offload, OSS (AMP, ForEach,
  # Gloo) and FSDP benchmarks against stable torch (1.13.0).
  benchmarks_1:
    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies. The key names the
      # torch version installed by install_dep_pytorch_stable.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      # Skip this benchmark due to some download issue.
      #- <<: *run_pipe_benchmark

      - <<: *run_offload_benchmark

      - <<: *run_oss_amp

      - <<: *run_oss_for_each

      - <<: *run_oss_gloo

      - <<: *run_fsdp_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}
599

600
  # Benchmark job on the medium multi-GPU executor: OSS benchmark (world size
  # 4) against stable torch (1.13.0).
  benchmarks_2:
    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Do this first to test an issue like #1042. We keep benchmark_1 doing it
      # after requirements-dev.txt, but change benchmark_2 to do this earlier to
      # ensure both cases are tested.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies. The key names the
      # torch version installed by install_dep_pytorch_stable.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-13-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_oss_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}
644

Mandeep Singh Baines's avatar
Mandeep Singh Baines committed
645
646
647
648
649

# Single "build" workflow: four CPU jobs, each GPU job once per test list file
# (ci_test_list_1..3), and the two benchmark jobs.
workflows:
  version: 2
  build:
    jobs:
      - cpu_tests_py38
      - cpu_tests_py39
      - cpu_tests_py310
      - cpu_tests_py311
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_3.txt
      - benchmarks_1
      - benchmarks_2