# Python CircleCI 2.1 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
# Adapted from
# https://github.com/facebookresearch/detectron2/blob/main/.circleci/config.yml
#
# Pro tip: download the CircleCI CLI to validate the config locally during development.
#
# To reset/clean the cache, update the CACHE_VERSION variable in the project settings
# of the fairscale project in CircleCI's web UI. The CACHE_VERSION follows the convention
# v${FAIRSCALE_VERSION}-${CACHE_NUMBER}, e.g. v0.4.2-1. CACHE_NUMBER must start
# at 1 and increase in whole numbers. When changing the CACHE_VERSION manually,
# always set the FAIRSCALE_VERSION value to the fairscale version being tested.
# To reset the cache when not updating the fairscale version, only update the
# CACHE_NUMBER value.

# CircleCI configuration schema version.
version: 2.1

# Orbs imported by this config; `codecov/upload` is used by the
# upload_coverage alias.
orbs:
  codecov: codecov/codecov@1.0.2

# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
# CPU environment: Python 3.8 Docker image on the "large" resource class.
cpu_py38: &cpu_py38
  docker:
    - image: circleci/python:3.8
  resource_class: large

# CPU environment: Python 3.9 Docker image on the "large" resource class.
cpu_py39: &cpu_py39
  docker:
    - image: circleci/python:3.9
  resource_class: large

# CPU environment: Python 3.10 Docker image on the "large" resource class.
cpu_py310: &cpu_py310
  resource_class: large
  docker:
    - image: 'circleci/python:3.10'

# Here is the list of GPU images:
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
# We need to use multiple GPUs for several jobs. The resource_class
# values are available here: T101565170
# gpu.nvidia.small.multi = 2 GPUs with 16 GB RAM each
# gpu.nvidia.medium.multi = 4 GPUs with 16 GB RAM each

# GPU environment: CUDA 11.2 machine image on gpu.nvidia.small.multi.
gpu_cu_11_2_small_multi: &gpu_cu_11_2_small_multi
  environment:
    # Quoted so the value stays the string "11.2" rather than a float.
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.small.multi

# GPU environment: CUDA 11.2 machine image on gpu.nvidia.medium.multi.
gpu_cu_11_2_medium_multi: &gpu_cu_11_2_medium_multi
  environment:
    # Quoted so the value stays the string "11.2" rather than a float.
    CUDA_VERSION: "11.2"
    CUDA_HOME: /usr/local/cuda-11.2
  machine:
    image: ubuntu-2004-cuda-11.2:202103-01
  resource_class: gpu.nvidia.medium.multi

# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
# Creates a virtualenv at ~/venv, appends its activation line to $BASH_ENV,
# activates it for the current step, and upgrades pip.
setup_venv: &setup_venv
  - run:
      name: Setup Virtual Env
      working_directory: ~/
      # Use "bash -x" here to debug early commands executed in .bashrc.
      shell: /bin/bash
      command: |
        set -e
        set -o pipefail
        python -m venv ~/venv
        echo ". ~/venv/bin/activate" >> $BASH_ENV
        . ~/venv/bin/activate
        python --version
        which python
        which pip
        pip install --upgrade pip

# most recent LTS version
#
# Installs torch 1.8.2 (LTS) plus the dev and benchmark requirements. The step
# exits early when a restored venv already contains check_version.py and that
# script reports a matching torch version.
install_dep_pytorch_lts: &install_dep_pytorch_lts
  - run:
      name: Install Dependencies with torch 1.8.2 (LTS)
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.8 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.8.2+cu111 torchvision==0.9.2+cu111 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "8"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

# most recent stable version
#
# Installs torch 1.12.0 plus the dev and benchmark requirements. The step
# exits early when a restored venv already contains check_version.py and that
# script reports a matching torch version.
#
# The version probed by the early-exit check must match the version installed
# and asserted below (1.12); otherwise a stale cached venv holding an older
# torch would be accepted and the install skipped.
install_dep_pytorch_stable: &install_dep_pytorch_stable
  - run:
      name: Install Dependencies with torch 1.12.0
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.12 && exit 0; fi
        # start installing
        pip install --progress-bar off torch==1.12.0+cu113 torchvision==0.13.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "12"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

# pinned nightly preview build
#
# Installs a pinned torch 1.13.0 dev build plus the dev and benchmark
# requirements. The step exits early when a restored venv already contains
# check_version.py and that script reports a matching torch version.
#
# The version probed by the early-exit check must match the version installed
# and asserted below (1.13); otherwise a stale cached venv holding an older
# torch would be accepted and the install skipped.
install_dep_pytorch_nightly: &install_dep_pytorch_nightly
  - run:
      name: Install Dependencies with a torch nightly preview build
      command: |
        # check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
        if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.13 && exit 0; fi
        # start installing
        pip install --progress-bar off --pre torch==1.13.0.dev20220825+cu113 torchvision==0.14.0.dev20220825+cu113 --extra-index-url https://download.pytorch.org/whl/nightly/cu113
        pip install --progress-bar off -r requirements-dev.txt
        pip install --progress-bar off -r requirements-benchmarks.txt
        python -c 'import torch; print("Torch version:", torch.__version__)'
        python -c 'import torch; assert torch.__version__.split(".")[:2] == ["1", "13"], f"wrong torch version {torch.__version__}"'
        python -m torch.utils.collect_env
        wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py

# Installs the checked-out repository with pip and verifies it can be imported.
install_repo: &install_repo
  - run:
      name: Install Repository
      command: |
        pip install .
        # Test import. The first sys.path entry is dropped before importing;
        # presumably so the installed package is imported rather than the
        # source checkout in the working directory.
        python -c 'import sys; sys.path = sys.path[1:]; import fairscale'

# Runs the repository script that validates the unit test list files.
check_test_list: &check_test_list
  - run:
      name: Verify that unit test list files are correct
      command: |
        bash ./tests/ci_test_list_check.sh

# Uploads coverage.xml through the codecov orb, using the CODECOV_TOKEN
# environment variable as the token.
upload_coverage: &upload_coverage
  - codecov/upload:
      file: 'coverage.xml'
      token: $CODECOV_TOKEN

# Runs the experimental offload benchmark with activation checkpointing enabled.
run_offload_benchmark: &run_offload_benchmark
  - run:
      name: Run Offload Benchmark
      command: |
        PYTHONPATH=. python benchmarks/experimental/offload.py --checkpoint_activation

# Runs the FSDP benchmark on synthetic data.
run_fsdp_benchmark: &run_fsdp_benchmark
  - run:
      name: Run FSDP Benchmark
      command: |
        PYTHONPATH=. python benchmarks/fsdp.py --use_synthetic_data

# Runs the pipe benchmark with its default arguments.
run_pipe_benchmark: &run_pipe_benchmark
  - run:
      name: Run Pipe Benchmark
      command: |
        PYTHONPATH=. python benchmarks/pipe.py

# Runs the OSS benchmark twice with world size 4: a short 2-epoch run, then a
# 12-epoch oss_sharded_ddp run with --check_regression.
run_oss_benchmark: &run_oss_benchmark
  - run:
      name: Run OSS Benchmark
      command: |
        PYTHONPATH=. python benchmarks/oss.py --world_size 4 --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --check_regression --world_size 4 --optim_type oss_sharded_ddp --epochs 12

# Runs the OSS benchmark with --gloo for both the oss_ddp and oss_sharded_ddp
# optimizer types.
run_oss_gloo: &run_oss_gloo
  - run:
      name: Run OSS with Gloo
      command: |
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_ddp --epochs 2
        PYTHONPATH=. python benchmarks/oss.py --gloo --optim_type oss_sharded_ddp --epochs 2

# Runs the OSS benchmark with --amp for the oss_sharded_ddp optimizer type.
run_oss_amp: &run_oss_amp
  - run:
      name: Run OSS with Torch AMP
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp

# Same as the AMP run, with --multi_tensor_optim added.
run_oss_for_each: &run_oss_for_each
  - run:
      name: Run OSS with Torch AMP and ForEach optimizer
      command: |
        PYTHONPATH=. python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp --multi_tensor_optim

# Builds the docs as single-page HTML and fails the step if the tail of the
# build output mentions a warning.
run_doc_build: &run_doc_build
  - run:
      name: Testing doc build
      command: |
        cd docs
        pip install --progress-bar off -r requirements.txt
        make help
        make singlehtml | tee make.out
        # Keep this as the last command: its exit status is the step's result.
        ! tail make.out | grep -q warning

# This is an alias to run all unit tests possible on a platform.
run_unittests: &run_unittests
  - run:
      name: Run all unit tests.
      # On CPU we run everything and do not stop at the first failure, since
      # docker time is cheaper.
      command: |
        ulimit -n 10000
        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./

commands:

   # NOTE: entries under `commands` use a 3-space indent; keep sibling
   # commands at the same column.

   # This is a command (like a function) that runs tests from a given test_list_file.
   # If test_list_file is not given, this results in an error.
   run_unittests_from_list:
     parameters:
       test_list_file:
         type: string
         default: "/dev/non_exist"  # Default to error out
     steps:
       - run:
           name: Run Unit Tests
           command: |
             ulimit -n 10000
             if [ ! -f "<<parameters.test_list_file>>" ]; then
               echo "test list file not found: <<parameters.test_list_file>>" >&2
               exit 1
             fi
             # The command substitution is intentionally unquoted so that each
             # entry in the list file becomes a separate pytest argument.
             pytest --junitxml=test-results/junit.xml --verbose --timeout 70 --cov-report=xml --cov=./ $(cat "<<parameters.test_list_file>>")

   # Pins pyenv to a fixed release, then installs the requested Python version
   # and makes it the global pyenv version.
   #
   # Parameters:
   #   version: Python version passed to `pyenv install` / `pyenv global`.
   setup_pyenv:
     parameters:
       version:
         type: string
     steps:
       # Cache the pyenv directory to avoid re-downloading over and over.
       # The key is derived from the requested version so that a different
       # version gets its own cache entry instead of reusing another one's.
       - restore_cache:
           keys:
             - cache-key-pyenv-<<parameters.version>>-v1

       - run:
           name: Setup pyenv
           # We used to use the following commands to update pyenv.
           #   git clone https://github.com/pyenv/pyenv-update.git $(pyenv root)/plugins/pyenv-update
           #   pyenv update
           # However, that is not deterministic since pyenv keeps being updated.
           # It is now fixed to a version. (v2.3.0 is broken since it causes bash to fail when it tries to do "eval $(pyenv init -)")
           #
           # We use "-sf" to skip an already installed version after restoring the pyenv dir from cache. This should avoid
           # re-downloading and re-installing. The key name has a "v1" at the end so that we can debug and try different
           # cache objects while doing development.
           command: |
             cd /opt/circleci/.pyenv/
             git remote update
             git checkout v2.2.0
             pyenv install -sf <<parameters.version>>
             pyenv global <<parameters.version>>

       - save_cache:
           paths:
             - /opt/circleci/.pyenv
           key: cache-key-pyenv-<<parameters.version>>-v1

# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------

jobs:
  # CPU job on Python 3.8: checks the test lists, installs the repo and the
  # stable torch dependencies, runs all unit tests and the doc build.
  cpu_tests_py38:
    <<: *cpu_py38

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-cpu-py38-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}
      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py38-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  # CPU job on Python 3.9: checks the test lists, installs the repo and the
  # stable torch dependencies, runs all unit tests and the doc build.
  cpu_tests_py39:
    <<: *cpu_py39

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-cpu-py39-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py39-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  # CPU job on Python 3.10: checks the test lists, installs the repo and the
  # stable torch dependencies, runs all unit tests and the doc build.
  cpu_tests_py310:
    <<: *cpu_py310

    working_directory: ~/fairscale

    steps:
      - checkout
      - <<: *check_test_list
      - <<: *setup_venv

      # Do this first to test repo dependencies. Basic import should work after this
      # installation. See issue #1042 for an example.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-cpu-py310-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-cpu-py310-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_unittests
      - <<: *run_doc_build

      - store_test_results:
          path: test-results

  # GPU job against the torch LTS install. Runs only the tests listed in
  # test_list_file; the default points at a non-existent file so that a
  # missing argument fails the test step.
  gpu_tests_lts:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select the pinned Python version through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_lts

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-8-2-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results

      # Disabled codecov since https://codecov.io/bash seems to be down.
      #- <<: *upload_coverage

  # GPU job against the stable torch install. Runs only the tests listed in
  # test_list_file; the default points at a non-existent file so that a
  # missing argument fails the test step.
  gpu_tests_stable:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select the pinned Python version through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results

  # GPU job against the pinned torch nightly install. Runs only the tests
  # listed in test_list_file; the default points at a non-existent file so
  # that a missing argument fails the test step.
  gpu_tests_pytorch_nightly:
    parameters:
      test_list_file:
        type: string
        default: "/dev/non_exist"

    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      # Install and select the pinned Python version through pyenv.
      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies.
      # The suffix in the key tracks the nightly build that
      # install_dep_pytorch_nightly installs (1.13.0.dev20220825).
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-gpu-torch-1-13-0825-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_dep_pytorch_nightly

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-gpu-torch-1-13-0825-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - run_unittests_from_list:
          test_list_file: <<parameters.test_list_file>>

      - store_test_results:
          path: test-results

  # Benchmark job on the small multi-GPU class: pipe, offload, OSS (AMP,
  # ForEach, Gloo) and FSDP benchmarks. The repo is installed after the
  # dependencies here.
  benchmarks_1:
    <<: *gpu_cu_11_2_small_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *install_repo

      - <<: *run_pipe_benchmark

      - <<: *run_offload_benchmark

      - <<: *run_oss_amp

      - <<: *run_oss_for_each

      - <<: *run_oss_gloo

      - <<: *run_fsdp_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

  # Benchmark job on the medium multi-GPU class: runs the OSS benchmark,
  # including its regression check.
  benchmarks_2:
    <<: *gpu_cu_11_2_medium_multi

    working_directory: ~/fairscale

    steps:
      - checkout

      - run: nvidia-smi

      - setup_pyenv:
          version: 3.9.7

      - <<: *setup_venv

      # Do this first to test an issue like #1042. We keep benchmarks_1 doing it
      # after requirements-dev.txt, but change benchmarks_2 to do this earlier to
      # ensure both cases are tested.
      - <<: *install_repo

      # Cache the venv directory that contains dependencies.
      # The torch version in the key tracks what install_dep_pytorch_stable installs.
      - restore_cache:
          keys:
            - cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      # Cache the MNIST directory that contains benchmark data
      - restore_cache:
          keys:
            - cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

      - <<: *install_dep_pytorch_stable

      - save_cache:
          paths:
            - ~/venv
          key: cache-key-py-3-9-7-benchmarks-torch-1-12-0-{{.Environment.CACHE_VERSION}}-{{checksum "setup.py"}}-{{checksum "requirements-dev.txt"}}

      - <<: *run_oss_benchmark

      - save_cache:
          paths:
            - /tmp/MNIST
          key: cache-key-benchmark-MNIST-{{.Environment.CACHE_VERSION}}-{{checksum "benchmarks/datasets/mnist.py"}}

# The single `build` workflow: three CPU jobs, the three GPU job flavours
# (LTS, stable, nightly) once per test list file, and the two benchmark jobs.
workflows:
  version: 2
  build:
    jobs:
      - cpu_tests_py38
      - cpu_tests_py39
      - cpu_tests_py310
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_1.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_2.txt
      - gpu_tests_lts:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_stable:
          test_list_file: tests/ci_test_list_3.txt
      - gpu_tests_pytorch_nightly:
          test_list_file: tests/ci_test_list_3.txt
      - benchmarks_1
      - benchmarks_2