Commit f29b3f8d authored by Michael Carilli

Make main_amp.py more profiling-friendly

parent 4b9858ec
@@ -173,3 +173,9 @@ Running with the `--deterministic` flag should produce bitwise identical outputs
 regardless of what other options are used (see [Pytorch docs on reproducibility](https://pytorch.org/docs/stable/notes/randomness.html)).
 Since `--deterministic` disables `torch.backends.cudnn.benchmark`, `--deterministic` may
 cause a modest performance decrease.
+
+## Profiling
+
+If you're curious how the network actually looks on the CPU and GPU timelines (for example, how good is the overall utilization?
+Is the prefetcher really overlapping data transfers?) try profiling `main_amp.py`.
+[Detailed instructions can be found here](https://gist.github.com/mcarilli/213a4e698e4a0ae2234ddee56f4f3f95).
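The linked gist has the step-by-step instructions. As a rough orientation, the sketch below (not part of the repository) distills the pattern this commit adds to `main_amp.py`: a profiler capture window bounded by `cudaProfilerStart()`/`cudaProfilerStop()`, with NVTX ranges so each phase of an iteration shows up by name on the timeline. The toy model, `start_iter`, and `num_iters` are illustrative values, not anything from the example itself.

```python
import torch

# Minimal sketch of the capture-window pattern used in main_amp.py.
# Assumes a CUDA-enabled PyTorch build; the model and window bounds are made up.
start_iter, num_iters = 5, 10

model = torch.nn.Linear(64, 64).cuda()
data = torch.randn(32, 64, device="cuda")

for i in range(start_iter + num_iters + 1):
    if i == start_iter:
        torch.cuda.cudart().cudaProfilerStart()   # begin the capture window

    torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

    model.zero_grad()

    torch.cuda.nvtx.range_push("forward")
    out = model(data).sum()
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("backward")
    out.backward()
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_pop()                   # pop "Body of iteration"

    if i == start_iter + num_iters:
        torch.cuda.cudart().cudaProfilerStop()    # end the capture window
```

The capture window only takes effect when the profiler is told to defer collection to the cudaProfiler API (e.g. `nsys profile --capture-range=cudaProfilerApi ...` for Nsight Systems, or `--profile-from-start off` for nvprof); the gist linked above walks through the exact invocation.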
@@ -60,7 +60,7 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
-parser.add_argument('--prof', dest='prof', action='store_true',
+parser.add_argument('--prof', default=-1, type=int,
                     help='Only run 10 iterations for profiling.')
 parser.add_argument('--deterministic', action='store_true')
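With this change `--prof` carries an iteration index rather than acting as an on/off switch: as the `train()` hunks below show, the loop calls `cudaProfilerStart()` when the iteration counter reaches `args.prof` and `cudaProfilerStop()` (followed by `quit()`) ten iterations later, so the default of `-1` leaves profiling disabled. A small sketch of that interpretation, using a made-up value of 200:

```python
import argparse

parser = argparse.ArgumentParser()
# The flag now selects the iteration at which profiling begins;
# -1 (the default) disables the capture window entirely.
parser.add_argument('--prof', default=-1, type=int,
                    help='Only run 10 iterations for profiling.')

args = parser.parse_args(['--prof', '200'])   # illustrative value only

if args.prof >= 0:
    # Mirrors the checks in train(): start at i == args.prof,
    # stop and exit at i == args.prof + 10.
    print("capture window: iterations {} to {}".format(args.prof, args.prof + 10))
```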
@@ -236,8 +236,7 @@ def main():
         # train for one epoch
         train(train_loader, model, criterion, optimizer, epoch)
-        if args.prof:
-            break
 
         # evaluate on validation set
         prec1 = validate(val_loader, model, criterion)
......@@ -323,33 +322,34 @@ def train(train_loader, model, criterion, optimizer, epoch):
i = 0
while input is not None:
i += 1
if args.prof >= 0 and i == args.prof:
print("Profiling begun at iteration {}".format(i))
torch.cuda.cudart().cudaProfilerStart()
adjust_learning_rate(optimizer, epoch, i, len(train_loader))
if args.prof >= 0: torch.cuda.nvtx.range_push("Body of iteration {}".format(i))
if args.prof:
if i > 10:
break
adjust_learning_rate(optimizer, epoch, i, len(train_loader))
# compute output
if args.prof: torch.cuda.nvtx.range_push("forward")
if args.prof >= 0: torch.cuda.nvtx.range_push("forward")
output = model(input)
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
loss = criterion(output, target)
# compute gradient and do SGD step
optimizer.zero_grad()
if args.prof: torch.cuda.nvtx.range_push("backward")
if args.prof >= 0: torch.cuda.nvtx.range_push("backward")
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
# for param in model.parameters():
# print(param.data.double().sum().item(), param.grad.data.double().sum().item())
if args.prof: torch.cuda.nvtx.range_push("step")
if args.prof >= 0: torch.cuda.nvtx.range_push("optimizer.step()")
optimizer.step()
if args.prof: torch.cuda.nvtx.range_pop()
if args.prof >= 0: torch.cuda.nvtx.range_pop()
if i%args.print_freq == 0:
# Every print_freq iterations, check the loss, accuracy, and speed.
@@ -388,8 +388,17 @@ def train(train_loader, model, criterion, optimizer, epoch):
                     args.world_size*args.batch_size/batch_time.avg,
                     batch_time=batch_time,
                     loss=losses, top1=top1, top5=top5))
 
+        if args.prof >= 0: torch.cuda.nvtx.range_push("prefetcher.next()")
         input, target = prefetcher.next()
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        # Pop range "Body of iteration {}".format(i)
+        if args.prof >= 0: torch.cuda.nvtx.range_pop()
+
+        if args.prof >= 0 and i == args.prof + 10:
+            print("Profiling ended at iteration {}".format(i))
+            torch.cuda.cudart().cudaProfilerStop()
+            quit()
 
 
 def validate(val_loader, model, criterion):