Unverified Commit a842a927 authored by Yuanyuan (Ana) Shen, committed by GitHub

[feat] Add CPU support for pipe.py benchmarks (#188)

* Add CPU support for pipe.py benchmarks, CUDA-free
parent f80b303c
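The hunks below all apply one pattern: every unconditional .cuda() call or hard-coded torch.device("cuda") is guarded by torch.cuda.is_available(), falling back to CPU so the benchmark also runs on CUDA-free hosts. A minimal sketch of that pattern (the helper names default_device and to_default_device are illustrative, not part of the patch):

import torch


def default_device() -> torch.device:
    # Prefer the GPU when one is present, otherwise run on the CPU.
    return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def to_default_device(t: torch.Tensor) -> torch.Tensor:
    # Only move tensors to CUDA when it is actually available.
    return t.cuda() if torch.cuda.is_available() else t


if __name__ == "__main__":
    x = to_default_device(torch.zeros(4))
    print(default_device(), x.device)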
@@ -283,7 +283,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
     num_params = reduce(operator.add, (reduce(operator.mul, x.size()) for x in model.parameters()))
     if model.group:
-        total = torch.Tensor([num_params]).cuda()
+        total = torch.Tensor([num_params])
+        if torch.cuda.is_available():
+            total = total.cuda()
         torch.distributed.all_reduce(total, group=model.group)
         logging.info(
             f"training model, #prams = {num_params}, group: {model.group.rank()}, grank:"
@@ -305,6 +307,8 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
     if isinstance(model, DDP):
         model = model.module
+    if not torch.cuda.is_available():
+        return torch.device("cpu")
     if model.devices:
         return model.devices[0]
     else:
@@ -313,6 +317,9 @@ def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
 def get_last_device(model):
     if isinstance(model, DDP):
         model = model.module
+    if not torch.cuda.is_available():
+        return torch.device("cpu")
     if model.devices:
         return model.devices[-1]
     else:
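Both device helpers get the same early-out, so a CUDA-free run never consults model.devices. Roughly how the patched get_last_device reads after this change; the trailing else branch is not shown in the diff, so the final fallback below is an assumption:

import torch
from torch.nn.parallel import DistributedDataParallel as DDP


def get_last_device(model):
    if isinstance(model, DDP):
        model = model.module
    if not torch.cuda.is_available():
        return torch.device("cpu")  # CUDA-free hosts always use the CPU
    if model.devices:
        return model.devices[-1]  # device of the last pipeline stage
    return torch.device(torch.cuda.current_device())  # assumed fallback, not visible in the diff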
@@ -491,8 +498,8 @@ def generate_balance(num_devices, num_layers):
 def make_model_and_data(args, device, new_data: bool = True):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     if new_data:
-        device = torch.device("cuda")
         vocab_size = 10000
         model, criterion, optimizer, scaler = make_model(args, device, vocab_size)
         lm_dataset = BenchmarkLMDataset()
@@ -507,7 +514,6 @@ def make_model_and_data(args, device, new_data: bool = True):
             "vocab_size": vocab_size,
         }
     else:
-        device = torch.device("cuda")
         data = get_data(device)
         ntokens, train_data, val_data, test_data = data
         model, criterion, optimizer, scaler = make_model(args, device, ntokens)
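The two removals in make_model_and_data are the flip side of the added line at the top of the function: the device is chosen once, with a CPU fallback, instead of being hard-coded to CUDA separately in each branch. A toy stand-in for that refactor shape; the tensors below only mimic the generated and pre-existing datasets, they are not the benchmark's actual model or data code:

import torch


def make_model_and_data_sketch(new_data: bool = True):
    # Device picked once at the top, instead of per-branch torch.device("cuda").
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if new_data:
        payload = torch.randn(8, 16, device=device)  # stands in for the generated LM dataset
    else:
        payload = torch.arange(128, device=device).float()  # stands in for pre-existing data
    return device, payload


if __name__ == "__main__":
    print(make_model_and_data_sketch()[0])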
@@ -520,10 +526,10 @@ def make_model_and_data(args, device, new_data: bool = True):
 def bench_single_process(args):
-    num_devices = torch.cuda.device_count()
+    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
     assert num_devices > 0
     init_random_seed(0)
-    device = torch.device("cuda")
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     new_data = True
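The num_devices change matters because torch.cuda.device_count() returns 0 on a CUDA-free host, which would trip the assert just below it; treating the CPU as a single logical device keeps the pipeline balance generation working. A quick illustration of that fallback:

import torch

# On a GPU box this reports the real device count; on a CPU-only box it
# degrades to one logical device instead of zero.
num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
assert num_devices > 0
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"{num_devices} device(s), default: {device}")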
@@ -557,12 +563,13 @@ def run_mp_worker(args, available_workers):
         style=Pipe.AsyncSchedule,
         chunks=args.chunks,
         worker_map=get_worker_map(),
-        input_device=torch.cuda.current_device(),
+        input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
         pipelined_backward=args.pipelined_backward,
         checkpoint=args.checkpoint,
         # loss_fn=blob["criterion"],
-    ).cuda()
+    )
+    if torch.cuda.is_available():
+        p = p.cuda()
     if args.all_at_once and p.pipeline:
         print(f"running all at once")
         p.pipeline.all_at_once = True
@@ -678,7 +685,8 @@ parser.set_defaults(pipelined_backward=True)
 if __name__ == "__main__":
     args = parser.parse_args()
-    # bench_multi_process(args, all_at_once=True)
+    bench_multi_process(args, all_at_once=True)
+    if args.no_mpi or "OMPI_COMM_WORLD_RANK" not in os.environ:
         print(f"Running benchmark with args: {args}")
         bench_single_process(args)
@@ -736,7 +736,7 @@ class Pipe(Module):
             from .phony import get_phony
-            phony = get_phony(torch.device(torch.cuda.current_device()), requires_grad=True)
+            phony = get_phony(torch.device(torch.cuda.current_device() if torch.cuda.is_available() else "cpu"), requires_grad=True)
             output = PipelinedBackwardPass.apply(output, batches, phony, True)  # self.retain_graph)
         else:
             output = microbatch.gather(batches)
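The second touched file is the Pipe implementation itself (the class Pipe hunk above). The original call fails on CUDA-free hosts: torch.cuda.current_device() needs a working CUDA setup, and constructing torch.device from an integer ordinal always denotes a CUDA device, so the string "cpu" is the needed fallback. A small illustration of just the device choice; the zero-element tensor only mimics the idea of a phony gradient anchor and is not fairscale's get_phony:

import torch

if torch.cuda.is_available():
    dev = torch.device(torch.cuda.current_device())  # an int ordinal always means a CUDA device
else:
    dev = torch.device("cpu")

# Zero-element tensor with requires_grad, just to show where the device lands.
phony = torch.empty(0, device=dev, requires_grad=True)
print(phony.device)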