Commit f29b3f8d authored by Michael Carilli
Browse files

Make main_amp.py more profiling-friendly

parent 4b9858ec
...@@ -173,3 +173,9 @@ Running with the `--deterministic` flag should produce bitwise identical outputs ...@@ -173,3 +173,9 @@ Running with the `--deterministic` flag should produce bitwise identical outputs
regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)). regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
cause a modest performance decrease. cause a modest performance decrease.
## Profiling
If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization?
Is the prefetcher really overlapping data transfers?), try profiling `main_amp.py`.
[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95).
...@@ -60,7 +60,7 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', ...@@ -60,7 +60,7 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
parser.add_argument('--pretrained', dest='pretrained', action='store_true', parser.add_argument('--pretrained', dest='pretrained', action='store_true',
help='use pre-trained model') help='use pre-trained model')
parser.add_argument('--prof', dest='prof', action='store_true', parser.add_argument('--prof', default=-1, type=int,
help='Only run 10 iterations for profiling.') help='Only run 10 iterations for profiling.')
parser.add_argument('--deterministic', action='store_true') parser.add_argument('--deterministic', action='store_true')
...@@ -236,8 +236,7 @@ def main(): ...@@ -236,8 +236,7 @@ def main():
# train for one epoch # train for one epoch
train(train_loader, model, criterion, optimizer, epoch) train(train_loader, model, criterion, optimizer, epoch)
if args.prof:
break
# evaluate on validation set # evaluate on validation set
prec1 = validate(val_loader, model, criterion) prec1 = validate(val_loader, model, criterion)
...@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch): ...@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch):
i = 0 i = 0
while input is not None: while input is not None:
i += 1 i += 1
if args.prof >= 0 and i == args.prof:
print("Profiling begun at iteration {}".format(i))
torch.cuda.cudart().cudaProfilerStart()
adjust_learning_rate(optimizer, epoch, i, len(train_loader)) if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))
if args.prof: adjust_learning_rate(optimizer, epoch, i, len(train_loader))
if i > 10:
break
# compute output # compute output
if args.prof: torch.cuda.nvtx.range_push("forward") if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
output = model(input) output = model(input)
if args.prof: torch.cuda.nvtx.range_pop() if args.prof >= 0: torch.cuda.nvtx.range_pop()
loss = criterion(output, target) loss = criterion(output, target)
# compute gradient and do SGD step # compute gradient and do SGD step
optimizer.zero_grad() optimizer.zero_grad()
if args.prof: torch.cuda.nvtx.range_push("backward") if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
with amp.scale_loss(loss, optimizer) as scaled_loss: with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
if args.prof: torch.cuda.nvtx.range_pop() if args.prof >= 0: torch.cuda.nvtx.range_pop()
# for param in model.parameters(): # for param in model.parameters():
# print(param.data.double().sum().item(), param.grad.data.double().sum().item()) # print(param.data.double().sum().item(), param.grad.data.double().sum().item())
if args.prof: torch.cuda.nvtx.range_push("step") if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
optimizer.step() optimizer.step()
if args.prof: torch.cuda.nvtx.range_pop() if args.prof >= 0: torch.cuda.nvtx.range_pop()
if i%args.print_freq == 0: if i%args.print_freq == 0:
# Every print_freq iterations, check the loss, accuracy, and speed. # Every print_freq iterations, check the loss, accuracy, and speed.
...@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch): ...@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch):
args.world_size*args.batch_size/batch_time.avg, args.world_size*args.batch_size/batch_time.avg,
batch_time=batch_time, batch_time=batch_time,
loss=losses, top1=top1, top5=top5)) loss=losses, top1=top1, top5=top5))
if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
input, target = prefetcher.next() input, target = prefetcher.next()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
# Pop range "Body of iteration {}".format(i)
if args.prof >= 0: torch.cuda.nvtx.range_pop()
if args.prof >= 0 and i == args.prof + 10:
print("Profiling ended at iteration {}".format(i))
torch.cuda.cudart().cudaProfilerStop()
quit()
def validate(val_loader, model, criterion): def validate(val_loader, model, criterion):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment