[feature] Add a torch AMP benchmark option and test job (#175)

* oss benchmark: add an --amp option
* add a CircleCI test

[feature] Add a torch AMP benchmark option and test job (#175)
* oss benchmark: add an --amp option
* add a CircleCI test
cc766aa5 · Benjamin Lefaudeux · GitHub · 0d1f058b · cc766aa5 · cc766aa5
Unverified Commit cc766aa5 authored Nov 05, 2020 by Benjamin Lefaudeux Committed by GitHub Nov 05, 2020
Show whitespace changes
Inline Side-by-side

Showing with 17 additions and 3 deletions

.circleci/config.yml .circleci/config.yml +7 -0

benchmarks/oss.py benchmarks/oss.py +10 -3

No files found.
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -126,6 +126,11 @@ run_oss_gloo: &run_oss_gloo
    command: |
      python benchmarks/oss.py --gloo --optim_type oss_ddp --epochs 3
+run_oss_amp: &run_oss_amp
+- run:
+    name: Run OSS with Torch AMP
+    command: |
+      python benchmarks/oss.py --amp --epochs 3 --optim_type oss_sharded_ddp
 # -------------------------------------------------------------------------------------
 # Jobs to run
@@ -316,6 +321,8 @@ jobs:
      - <<: *run_oss_gloo
+      - <<: *run_oss_amp

--- a/benchmarks/oss.py
+++ b/benchmarks/oss.py
@@ -140,9 +140,15 @@ def train(
                            next(model.parameters()).norm().item(), next(model.parameters()).grad.norm().item()
                        )
                    )
+                if not args.cpu and args.amp:
+                    # Automatically computes the FW pass in half precision
+                    with torch.cuda.amp.autocast():
                        outputs = model(batch["inputs"])
                        loss = loss_fn(outputs, batch["label"])
+                else:
+                    outputs = model(batch["inputs"])
+                    loss = loss_fn(outputs, batch["label"])
                loss.backward()
                if optim_type == OptimType.oss_sharded_ddp:
@@ -244,7 +250,8 @@ if __name__ == "__main__":
    parser.add_argument("--profile", action="store_true", default=False)
    parser.add_argument("--cpu", action="store_true", default=False)
    parser.add_argument("--torchvision_model", type=str, help="Any torchvision model name (str)", default="resnet101")
-    parser.add_argument("--debug", action="store_true", default=False)
+    parser.add_argument("--debug", action="store_true", default=False, help="Display additional debug information")
+    parser.add_argument("--amp", action="store_true", default=False, help="Activate torch AMP")
    args = parser.parse_args()