Commit f29b3f8d authored by Michael Carilli

Make main_amp.py more profiling-friendly

parent 4b9858ec
@@ -173,3 +173,9 @@ Running with the `--deterministic` flag should produce bitwise identical outputs
 regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
 Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
 cause a modest performance decrease.
+
+## Profiling
+
+If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization?
+Is the prefetcher really overlapping data transfers?) try profiling `main_amp.py`.
+[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95).
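The linked gist has the step-by-step instructions. As a rough orientation, the sketch below (not part of the repository) distills the pattern this commit adds to `main_amp.py`: a profiler capture window bounded by `cudaProfilerStart()`/`cudaProfilerStop()`, with NVTX ranges so each phase of an iteration shows up by name on the timeline. The toy model, `start_iter`, and `num_iters` are illustrative values, not anything from the example itself.

```python
import torch

# Minimal sketch of the capture-window pattern used in main_amp.py.
# Assumes a CUDA-enabled PyTorch build; the model and window bounds are made up.
start_iter, num_iters = 5, 10

model = torch.nn.Linear(64, 64).cuda()
data = torch.randn(32, 64, device="cuda")

for i in range(start_iter + num_iters + 1):
    if i == start_iter:
        torch.cuda.cudart().cudaProfilerStart()   # begin the capture window

    torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

    model.zero_grad()

    torch.cuda.nvtx.range_push("forward")
    out = model(data).sum()
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("backward")
    out.backward()
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_pop()                   # pop "Body of iteration"

    if i == start_iter + num_iters:
        torch.cuda.cudart().cudaProfilerStop()    # end the capture window
```

The capture window only takes effect when the profiler is told to defer collection to the cudaProfiler API (e.g. `nsys profile --capture-range=cudaProfilerApi ...` for Nsight Systems, or `--profile-from-start off` for nvprof); the gist linked above walks through the exact invocation.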
@@ -60,7 +60,7 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
-parser.add_argument('--prof', dest='prof', action='store_true',
+parser.add_argument('--prof', default=-1, type=int,
                     help='Only run 10 iterations for profiling.')
 parser.add_argument('--deterministic', action='store_true')
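With this change `--prof` carries an iteration index rather than acting as an on/off switch: as the `train()` hunks below show, the loop calls `cudaProfilerStart()` when the iteration counter reaches `args.prof` and `cudaProfilerStop()` (followed by `quit()`) ten iterations later, so the default of `-1` leaves profiling disabled. A small sketch of that interpretation, using a made-up value of 200:

```python
import argparse

parser = argparse.ArgumentParser()
# The flag now selects the iteration at which profiling begins;
# -1 (the default) disables the capture window entirely.
parser.add_argument('--prof', default=-1, type=int,
                    help='Only run 10 iterations for profiling.')

args = parser.parse_args(['--prof', '200'])   # illustrative value only

if args.prof >= 0:
    # Mirrors the checks in train(): start at i == args.prof,
    # stop and exit at i == args.prof + 10.
    print("capture window: iterations {} to {}".format(args.prof, args.prof + 10))
```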
@@ -236,8 +236,7 @@ def main():
         # train for one epoch
         train(train_loader, model, criterion, optimizer, epoch)
-        if args.prof:
-            break
 
         # evaluate on validation set
         prec1 = validate(val_loader, model, criterion)
......@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch):
i = 0
while input is not None:
i += 1
if args.prof >= 0 and i == args.prof:
print("Profiling begun at iteration {}".format(i))
torch.cuda.cudart().cudaProfilerStart()
adjust_learning_rate(optimizer, epoch, i, len(train_loader))
if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))
if args.prof:
if i > 10:
break
adjust_learning_rate(optimizer, epoch, i, len(train_loader))
# compute output
if args.prof: torch.cuda.nvtx.range_push("forward")
if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
output = model(input)
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
loss = criterion(output, target)
# compute gradient and do SGD step
optimizer.zero_grad()
if args.prof: torch.cuda.nvtx.range_push("backward")
if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
# for param in model.parameters():
# print(param.data.double().sum().item(), param.grad.data.double().sum().item())
if args.prof: torch.cuda.nvtx.range_push("step")
if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
optimizer.step()
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
if i%args.print_freq == 0:
# Every print_freq iterations, check the loss, accuracy, and speed.
@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch):
                     args.world_size*args.batch_size/batch_time.avg,
                     batch_time=batch_time,
                     loss=losses, top1=top1, top5=top5))
 
+        if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
         input, target = prefetcher.next()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        # Pop range "Body of iteration {}".format(i)
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        if args.prof >= 0 and i == args.prof + 10:
+            print("Profiling ended at iteration {}".format(i))
+            torch.cuda.cudart().cudaProfilerStop()
+            quit()
 
 
 def validate(val_loader, model, criterion):