# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
import time
import torch
import torch.distributed as dist
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import (EPOCH_START, EPOCH_STOP, EVAL_START, EVAL_STOP, EVAL_ACCURACY)
import utils
from mlperf_common.scaleoutbridge import ScaleoutBridgeBase as SBridge
from async_executor import async_executor
def preprocessing(images, targets, model_ptr, data_layout):
# TODO: can be parallelized? should we use DALI? there must be a better way
target_per_image = []
for i in range(len(images)):
# create List[Dict] mapping for targets, used only for preprocessing.
# only 'boxes', and perhaps 'keypoints', are used for preprocessing.
dict_ = {'boxes': targets['boxes'][i]}
assert ('keypoints' not in targets)
target_per_image.append(dict_)
images, targets_ = model_ptr.transform(images, target_per_image)
# List[Dict] -> Dict[List]
for i in range(len(targets_)):
targets['boxes'][i] = targets_[i]['boxes']
images = images.tensors
if data_layout == 'channels_last':
images = images.to(memory_format=torch.channels_last)
return images, targets
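# Illustrative shapes after preprocessing (assuming the fixed 800x800 training resolution
# used elsewhere in this file; the exact padded size comes from model_ptr.transform):
#   images:  float tensor [N, 3, 800, 800], channels_last if data_layout requests it
#   targets: Dict[List], e.g. targets['boxes'] is a list of N tensors of shape [num_boxes_i, 4]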
def init_scratchpad(images, targets, batch_size, num_classes, amp, fused_focal_loss,
max_boxes, cls_head_padded, reg_head_padded, cuda_graphs):
device = targets['labels'][0].device
# TODO: should we skip validation in deployment?
# model_ptr.validate_input(images, targets)
# one-time init
if utils.ScratchPad.target_labels_padded is None and cls_head_padded:
utils.ScratchPad.target_labels_padded = torch.zeros([batch_size, max_boxes + 1],
device=device, dtype=torch.int64)
utils.ScratchPad.target_labels_padded[:, -1] = num_classes if not fused_focal_loss else -1
if utils.ScratchPad.target_boxes_padded is None and reg_head_padded:
utils.ScratchPad.target_boxes_padded = torch.zeros([batch_size, max_boxes, 4], device=device)
if utils.ScratchPad.target_n is None and (cls_head_padded or reg_head_padded):
utils.ScratchPad.target_n = torch.zeros([batch_size, 1], device=device, dtype=torch.int64)
if utils.ScratchPad.target_matched_idxs is None and cuda_graphs:
utils.ScratchPad.target_matched_idxs = torch.zeros_like(targets['matched_idxs'], device=device)
# these allocations are used to avoid allocations per iteration
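# The constant 120087 below is the per-image anchor count: an 800x800 input yields feature
# maps of 100x100, 50x50, 25x25, 13x13 and 7x7 (13343 locations), and with 9 anchors per
# location (3 scales x 3 aspect ratios, the usual RetinaNet setup) 13343 * 9 = 120087.
# See AnchorGenerator.forward_opt for the matching grid sizes.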
if utils.ScratchPad.gt_classes_target is None:
if not fused_focal_loss:
utils.ScratchPad.gt_classes_target = \
torch.zeros(torch.Size([batch_size, 120087, num_classes + (1 if cls_head_padded else 0)]),
dtype=torch.float32 if not amp else torch.float16).to(device)
else:
utils.ScratchPad.gt_classes_target = \
torch.zeros(torch.Size([batch_size, 120087]), device=device, dtype=torch.int64)
if utils.ScratchPad.batch_size_vector is None:
utils.ScratchPad.batch_size_vector = torch.arange(len(targets['boxes'])).unsqueeze(1).cuda()
# data init
if cls_head_padded:
utils.ScratchPad.target_labels_padded[:, :-1].fill_(0)
if reg_head_padded:
utils.ScratchPad.target_boxes_padded.fill_(0)
if cuda_graphs:
utils.ScratchPad.target_matched_idxs.copy_(targets['matched_idxs'])
for i in range(images.size(0)):
# debug
# assert targets['labels'][i].size(0) < max_boxes
labels_n = targets['labels'][i].size(0)
if cls_head_padded:
utils.ScratchPad.target_labels_padded[i][:labels_n] = targets['labels'][i][:labels_n]
# debug: if args.apex_focal_loss, then the -1 position remains num_classes and is not overridden
# assert ((not fused_focal_loss and (utils.ScratchPad.target_labels_padded[:, -1] == num_classes).all())
# or fused_focal_loss)
if reg_head_padded:
utils.ScratchPad.target_boxes_padded[i][:labels_n] = targets['boxes'][i][:labels_n]
if cls_head_padded or reg_head_padded:
utils.ScratchPad.target_n[i] = labels_n
utils.ScratchPad.gt_classes_target.fill_(0 if not fused_focal_loss else -1)
def compute_matched_idxs(targets_boxes, model_ptr):
matched_idxs = model_ptr.get_matched_idxs(targets_boxes)
return matched_idxs
def loss_preprocessing(targets_boxes, targets_labels, matched_idxs, model_ptr, fused_focal_loss, max_boxes,
cls_head_padded, reg_head_padded):
# classification loss prologues
if cls_head_padded:
gt_classes_target, num_foreground, valid_idxs = \
model_ptr.head.classification_head.compute_loss_prologue_padded(targets_labels,
matched_idxs,
one_hot=(not fused_focal_loss),
max_boxes=max_boxes)
else:
gt_classes_target, num_foreground, valid_idxs = \
model_ptr.head.classification_head.compute_loss_prologue(targets_labels, matched_idxs,
one_hot=(not fused_focal_loss))
# regression loss prologues
if reg_head_padded:
target_regression, _, foreground_idxs_mask = \
model_ptr.head.regression_head.compute_loss_prologue_padded(targets_boxes, matched_idxs, model_ptr.anchors)
else:
target_regression, _, foreground_idxs_mask = \
model_ptr.head.regression_head.compute_loss_prologue(targets_boxes, matched_idxs, model_ptr.anchors)
return gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask
def compute_loss(model_ptr, cls_logits, bbox_regression, valid_idxs, gt_classes_target, num_foreground,
target_regression, foreground_idxs_mask, fused_focal_loss, reg_head_padded):
cls_loss = model_ptr.head.classification_head.compute_loss_core(cls_logits, gt_classes_target,
valid_idxs, num_foreground,
fused_focal_loss=fused_focal_loss)
if reg_head_padded:
reg_loss = model_ptr.head.regression_head.compute_loss_core_padded(bbox_regression, target_regression,
foreground_idxs_mask, num_foreground)
else:
reg_loss = model_ptr.head.regression_head.compute_loss_core(bbox_regression, target_regression,
foreground_idxs_mask, num_foreground)
return cls_loss, reg_loss
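# The two head losses are summed into the total training loss in train_one_epoch:
#   total_loss = cls_loss + reg_loss
# where cls_loss is the (focal) classification loss over valid anchors and reg_loss is the
# box-regression loss over foreground anchors; the normalization by num_foreground (the
# usual RetinaNet convention) happens inside the heads' compute_loss_core* implementations.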
def train_one_epoch(model, optimizer, scaler, data_loader, device, epoch, train_group, args,
graphed_model=None, static_input=None, static_loss=None, static_prologues_out=None,
sbridge=SBridge()):
mllogger.start(key=EPOCH_START, value=epoch, metadata={"epoch_num": epoch}, sync=True, sync_group=train_group)
sbridge.start_epoch_prof()
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
if not args.skip_metric_loss:
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
# direct pointer to the model
model_ptr = model.module if args.distributed else model
lr_scheduler = None
if epoch < args.warmup_epochs:
# Convert epochs to iterations
# we want to control warmup at the epoch level, but update lr every iteration
start_iter = epoch*len(data_loader)
warmup_iters = args.warmup_epochs*len(data_loader)
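# Sketch of the warmup behavior this helper is expected to provide (the actual logic lives
# in utils.warmup_lr_scheduler): the lr factor ramps linearly from warmup_factor to 1.0,
#   factor(it) = warmup_factor * (1 - alpha) + alpha, with alpha = (start_iter + it) / warmup_iters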
lr_scheduler = utils.warmup_lr_scheduler(optimizer, start_iter, warmup_iters, args.warmup_factor)
accuracy = None
for images, targets in metric_logger.log_every(data_loader, args.print_freq, header):
sbridge.start_prof(SBridge.ITER_TIME)
if args.syn_dataset:
images = list(image.to(device, non_blocking=True) for image in images)
images = torch.stack(images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
targets['matched_idxs'] = torch.stack(targets['matched_idxs'])
else:
# DALI iterator provides data as needed
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
# arrange "targets" as a Dict[List], instead of a List[Dict], so later it will be easier to use targets
# data in parallel (e.g., to get the entire batch "boxes", one can just use targets['boxes']).
# TODO: there might be some unused fields in the targets tensor, so perhaps can avoid some transfers
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# DALI can compute matched_idxs and put it in targets, but if it doesn't do so, do it here
if 'matched_idxs' not in targets:
with torch.cuda.amp.autocast(enabled=args.amp):
targets['matched_idxs'] = compute_matched_idxs(targets['boxes'], model_ptr)
if not args.cuda_graphs:
optimizer.zero_grad()
# init necessary data in the scratchpad
with torch.cuda.amp.autocast(enabled=args.amp):
init_scratchpad(images, targets, args.batch_size, args.num_classes, args.amp,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad,
args.cuda_graphs)
if lr_scheduler is not None:
lr_scheduler.step()
if args.cuda_graphs:
if args.not_graphed_prologues:
with torch.cuda.amp.autocast(enabled=args.amp):
# loss prologue: preprocess everything that does not require model forward and backward
# use the padded scratchpad buffers if reg_head_pad/cls_head_pad are toggled
targets_boxes = targets['boxes'] if not args.reg_head_pad else utils.ScratchPad.target_boxes_padded
targets_labels = targets['labels'] if not args.cls_head_pad else utils.ScratchPad.target_labels_padded
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(targets_boxes, targets_labels, targets['matched_idxs'], model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
static_input.copy_(images)
# All necessary data is copied to the graph buffers in init_scratchpad
# The graph is programmed to use pointers to the scratchpad (besides images)
if args.not_graphed_prologues:
static_prologues_out[0].copy_(gt_classes_target)
static_prologues_out[1].copy_(target_regression)
static_prologues_out[2].copy_(num_foreground)
static_prologues_out[3].copy_(valid_idxs)
static_prologues_out[4].copy_(foreground_idxs_mask)
# graphed model comprises loss_preprocessing->forward->compute_loss->backward
graphed_model.replay()
if not args.skip_metric_loss:
dist.all_reduce(tensor=static_loss, group=train_group)
losses_reduced = static_loss / utils.get_world_size()
if args.sync_after_graph_replay:
torch.cuda.synchronize()
sbridge.start_prof(SBridge.OPT_TIME)
scaler.step(optimizer)
scaler.update()
sbridge.stop_prof(SBridge.OPT_TIME)
else:
with torch.cuda.amp.autocast(enabled=args.amp):
# loss prologue: preprocess everything that does not require model forward and backward
# use the padded scratchpad buffers if reg_head_pad/cls_head_pad are toggled
targets_boxes = utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes']
targets_labels = utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels']
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(targets_boxes, targets_labels, targets['matched_idxs'], model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
sbridge.start_prof(SBridge.FWD_TIME)
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
# loss (given the prologue computations)
cls_loss, reg_loss = compute_loss(model_ptr, model_output[5], model_output[6], valid_idxs,
gt_classes_target, num_foreground, target_regression,
foreground_idxs_mask, args.apex_focal_loss, args.reg_head_pad)
loss_dict = {'classification': cls_loss, 'bbox_regression': reg_loss}
losses = sum(loss for loss in loss_dict.values())
# --- old loss (for debug)
# loss_dict_ = model_ptr.compute_loss(targets, head_outputs)
# assert(torch.allclose(loss_dict['classification'], loss_dict_['classification']))
# assert(torch.allclose(loss_dict['bbox_regression'], loss_dict_['bbox_regression']))
# reduce losses over all GPUs for logging purposes
# TODO: remove
loss_dict_reduced = utils.reduce_dict(loss_dict, group=train_group)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
sbridge.stop_prof(SBridge.FWD_TIME)
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
# backward
sbridge.start_prof(SBridge.BWD_TIME)
scaler.scale(losses).backward()
sbridge.stop_start_prof(SBridge.BWD_TIME, SBridge.OPT_TIME)
scaler.step(optimizer)
scaler.update()
sbridge.stop_prof(SBridge.OPT_TIME)
if not args.skip_metric_loss:
if not args.cuda_graphs:
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
else:
metric_logger.update(loss=losses_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
# Check async coco result
if args.async_coco and not (metric_logger.current_iter % args.async_coco_check_freq):
# FIXME(ahmadki): --num-eval-ranks
if args.eval_rank == 0:
results = async_executor.pop_if_done()
# in case multiple results are returned, keep the highest mAP
if results and len(results) > 0:
accuracy = max([result['bbox'][0] for result in results.values() if result], default=-1)
if args.distributed:
accuracy = utils.broadcast(accuracy, 0, group=train_group)
if args.target_map and accuracy and accuracy >= args.target_map:
break
sbridge.stop_prof(SBridge.ITER_TIME)
sbridge.stop_epoch_prof()
mllogger.end(key=EPOCH_STOP, value=epoch, metadata={"epoch_num": epoch}, sync=True, sync_group=train_group)
summary = metric_logger.summary
if summary['samples'] > 0:
throughput = summary['samples'] / (summary['end_time'] - summary['start_time'])
mllogger.event(key='tracked_stats', value={'throughput': throughput}, metadata={'step': (epoch + 1)})
return metric_logger, accuracy
@torch.no_grad()
def evaluate(model, data_loader, device, epoch, eval_group, args,
graphed_model=None, static_input=None, static_output=None, sbridge=SBridge()):
sbridge.start_eval_prof()
mllogger.start(key=EVAL_START, value=epoch, metadata={"epoch_num": epoch},
sync=True, sync_group=eval_group)
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
iou_types = ["bbox"]
if args.cocoeval == 'python':
from coco.python_coco import PythonCocoEvaluator
coco_evaluator = PythonCocoEvaluator(annotations_file=args.val_annotations_file,
iou_types=iou_types, group=eval_group)
elif args.cocoeval == 'nvidia':
from coco.nvidia_coco import NVCocoEvaluator
coco_evaluator = NVCocoEvaluator(annotations_file=args.val_annotations_file,
iou_types=iou_types, num_threads=args.coco_threads, group=eval_group)
else:
assert False, f"Unknown coco evaluator implementation: {args.cocoeval}"
model_ptr = model.module if args.distributed else model
for images, targets in metric_logger.log_every(data_loader, args.eval_print_freq, header):
if args.dali_eval and images.shape[0] == 0:
continue
if not args.dali_eval:
images = list(img.to(device, non_blocking=True) for img in images)
if torch.cuda.is_available():
torch.cuda.synchronize()
# preprocessing
for i, (image, target) in enumerate(zip(images, targets)):
# add the original image size to targets
targets[i]['original_image_size'] = image.shape[-2:]
images, targets = model_ptr.transform(images, targets)
images = images.tensors
if args.data_layout == 'channels_last':
images = images.to(memory_format=torch.channels_last)
model_time = time.time()
if not args.cuda_graphs_eval:
with torch.cuda.amp.autocast(enabled=args.amp):
model_output = model(images)
else:
if images.size(0) != args.eval_batch_size:
static_input.fill_(0)
static_input[:images.size(0)].copy_(images)
graphed_model.replay()
model_output = static_output
if images.size(0) != args.eval_batch_size:
model_output = [x[:images.size(0)] for x in model_output]
with torch.cuda.amp.autocast(enabled=args.amp):
features = model_output[0:5]
head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
outputs = model_ptr.eval_postprocess(images, features, targets, head_outputs, targets_dict=args.dali_eval)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
if args.dali_eval:
res = {target.item(): output for target, output in zip(targets['image_id'], outputs)}
else:
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes(group=eval_group)
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
def log_callback(future):
stats = future.result()
accuracy = stats['bbox'][0]
mllogger.event(key=EVAL_ACCURACY, value=accuracy, metadata={"epoch_num": epoch})
mllogger.end(key=EVAL_STOP, value=epoch, metadata={"epoch_num": epoch})
accuracy = None
if (not args.distributed) or args.eval_rank == 0:
if args.async_coco:
async_executor.submit(tag=str(epoch),
fn=coco_evaluator.get_stats_from_evaluator,
evaluator=coco_evaluator)
async_executor.add_done_callback(tag=str(epoch), fn=log_callback)
else:
stats = coco_evaluator.get_stats_from_evaluator(coco_evaluator)
accuracy = stats['bbox'][0]
mllogger.event(key=EVAL_ACCURACY, value=accuracy, metadata={"epoch_num": epoch})
mllogger.end(key=EVAL_STOP, value=epoch, metadata={"epoch_num": epoch})
if (not args.async_coco) and args.distributed:
accuracy = utils.broadcast(accuracy, 0, group=eval_group)
torch.set_num_threads(n_threads)
sbridge.stop_eval_prof()
return accuracy
#!/usr/bin/gawk -f
# Requires GNU awk (gawk): the script uses BEGINFILE/ENDFILE and the three-argument match().
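# Usage (illustrative; the file name is whatever this script is saved as):
#   gawk -f parse_mll.awk --header run_0.log run_1.log > summary.tsv
# Pass --header as the first argument to print the column header once; one TSV row is then
# emitted per log file.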
BEGIN {
OFS="\t"
if (ARGV[1] == "--header") {
print_header()
ARGV[1]="" # tell awk that '--header' isn't a file name
}
}
function print_header() {
print("file", "gpus", "batch", "total", "converge", "init", "epoch1", "epoch_avg", "train_time", "last_eval", "[epoch", "eval_time", "mAP]*")
}
function get_mll_string_val(line, key) {
myregex= "\"" key "\": \"([^\"]*)\""
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_int_val(line, key) {
myregex= "\"" key "\": ([0-9]*)"
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_float_val(line, key) {
myregex= "\"" key "\": ([0-9.e+-]*)"
match(line, myregex, result_array)
return result_array[1]
}
function get_mll_time(line) {
return get_mll_int_val(line, "time_ms")/1000
}
function get_mll_epoch_num(line) {
return get_mll_int_val(line, "epoch_num")
}
BEGINFILE {
stop_status = "notdone"
last_eval_epoch = -1
run_start_time = 0
run_stop_time = 0
avg_epoch_time = -1
init_time = -1
training_time = 0
last_eval_time = 0
delete eval_time
delete eval_acc
delete time_at_epoch
ranks = 0
global_batch = -1
}
# make sure all the relevant lines have the fields in the positions we expect
# them (sometimes the parallel output causes multiple (or none) "0: " at the
# beginning instead of the single one expected, just make it none)
/:::MLL/ {
sub(/^.*:::MLL/, ":::MLL")
}
/:::MLL.*"key": "init_start"/ {
ranks = ranks+1
}
/:::MLL.*"key": "global_batch_size"/ {
global_batch = get_mll_int_val($0, "value")
}
/:::MLL.*"key": "epoch_start"/ {
epoch_num = get_mll_epoch_num($0)
epoch_start_time = get_mll_time($0)
time_at_epoch[epoch_num] = epoch_start_time
if (epoch_num == 1) {
init_time=epoch_start_time-run_start_time
}
}
/:::MLL.*"key": "epoch_stop"/ {
epoch_num = get_mll_epoch_num($0)
current_time = get_mll_time($0)
training_time = training_time + (current_time-time_at_epoch[epoch_num])
if (epoch_num > 1 && epoch_num <= 39) {
avg_epoch_time = (current_time - time_at_epoch[2])/(epoch_num-1)
}
}
/:::MLL.*"key": "eval_start"/ {
epoch_num = get_mll_epoch_num($0)
current_time = get_mll_time($0)
eval_time[epoch_num] = current_time
last_eval_time = current_time
}
/:::MLL.*"key": "eval_accuracy"/ {
eval_acc[get_mll_epoch_num($0)] = get_mll_float_val($0, "value")
}
/:::MLL.*"key": "eval_stop"/ {
epoch_num = get_mll_epoch_num($0)
eval_time[epoch_num] = get_mll_time($0) - eval_time[epoch_num]
last_eval_epoch = epoch_num
}
/:::MLL.*"key": "run_start"/ {
run_start_time=get_mll_time($0)
}
function printall(fname, total_time, last_eval_epoch, init_time, avg_epoch_time, eval_time, eval_acc) {
if (ranks > 0) {
local_batch=global_batch/ranks
} else {
local_batch = -1
}
printf("%s\t%d\t%d\t%.2f\t%s\t%.4f\t%.4f\t%.4f\t%.2f\t%.2f", fname, ranks, local_batch, total_time, last_eval_epoch, init_time, time_at_epoch[2]-time_at_epoch[1], avg_epoch_time, training_time, last_eval_time)
for (i in eval_time) {
printf("\t%d\t%.4f\t%.4f", i, eval_time[i], eval_acc[i])
}
printf("\n")
}
/:::MLL.*"key": "run_stop"/ {
stop_status = get_mll_string_val($0, "status")
run_stop_time = get_mll_time($0)
if (last_eval_time > 0) {
last_eval_time = run_stop_time-last_eval_time
}
if (stop_status == "success") {
stop_status = last_eval_epoch
}
}
ENDFILE {
printall(FILENAME, run_stop_time-run_start_time, stop_status, init_time, avg_epoch_time, eval_time, eval_acc)
}
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mlperf_common.logging import MLLoggerWrapper
from mlperf_common.frameworks.pyt import PyTCommunicationHandler
mllogger = MLLoggerWrapper(PyTCommunicationHandler(), value=None)
import warnings
from typing import Callable, List, Optional, Sequence, Tuple, Union
import torch
from torch import Tensor
from ..utils import _log_api_usage_once, _make_ntuple
interpolate = torch.nn.functional.interpolate
class FrozenBatchNorm2d(torch.nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed
Args:
num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)``
eps (float): a value added to the denominator for numerical stability. Default: 1e-5
"""
def __init__(
self,
num_features: int,
eps: float = 1e-5,
):
super().__init__()
_log_api_usage_once(self)
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features))
def _load_from_state_dict(
self,
state_dict: dict,
prefix: str,
local_metadata: dict,
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x: Tensor) -> Tensor:
# move reshapes to the beginning
# to make it fuser-friendly
w = self.weight.reshape(1, -1, 1, 1)
b = self.bias.reshape(1, -1, 1, 1)
rv = self.running_var.reshape(1, -1, 1, 1)
rm = self.running_mean.reshape(1, -1, 1, 1)
scale = w * (rv + self.eps).rsqrt()
bias = b - rm * scale
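# Algebraically this is plain inference-mode batch norm,
#   (x - running_mean) / sqrt(running_var + eps) * weight + bias,
# refactored into a single per-channel scale and shift so it lowers to one fused multiply-add.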
return x * scale + bias
def __repr__(self) -> str:
return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
class ConvNormActivation(torch.nn.Sequential):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, ...]] = 3,
stride: Union[int, Tuple[int, ...]] = 1,
padding: Optional[Union[int, Tuple[int, ...], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, ...]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d,
) -> None:
if padding is None:
if isinstance(kernel_size, int) and isinstance(dilation, int):
padding = (kernel_size - 1) // 2 * dilation
else:
_conv_dim = len(kernel_size) if isinstance(kernel_size, Sequence) else len(dilation)
kernel_size = _make_ntuple(kernel_size, _conv_dim)
dilation = _make_ntuple(dilation, _conv_dim)
padding = tuple((kernel_size[i] - 1) // 2 * dilation[i] for i in range(_conv_dim))
if bias is None:
bias = norm_layer is None
layers = [
conv_layer(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation=dilation,
groups=groups,
bias=bias,
)
]
if norm_layer is not None:
layers.append(norm_layer(out_channels))
if activation_layer is not None:
params = {} if inplace is None else {"inplace": inplace}
layers.append(activation_layer(**params))
super().__init__(*layers)
_log_api_usage_once(self)
self.out_channels = out_channels
if self.__class__ == ConvNormActivation:
warnings.warn(
"Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead."
)
class Conv2dNormActivation(ConvNormActivation):
"""
Configurable block used for Convolution2d-Normalization-Activation blocks.
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]] = 3,
stride: Union[int, Tuple[int, int]] = 1,
padding: Optional[Union[int, Tuple[int, int], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, int]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
) -> None:
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups,
norm_layer,
activation_layer,
dilation,
inplace,
bias,
torch.nn.Conv2d,
)
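# Example (illustrative): a Conv2d -> BatchNorm2d -> ReLU block with "same"-style padding
#   block = Conv2dNormActivation(64, 128, kernel_size=3, stride=2)
#   out = block(torch.randn(1, 64, 56, 56))   # -> shape (1, 128, 28, 28)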
class Conv3dNormActivation(ConvNormActivation):
"""
Configurable block used for Convolution3d-Normalization-Activation blocks.
Args:
in_channels (int): Number of channels in the input video.
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int, int]] = 3,
stride: Union[int, Tuple[int, int, int]] = 1,
padding: Optional[Union[int, Tuple[int, int, int], str]] = None,
groups: int = 1,
norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm3d,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
dilation: Union[int, Tuple[int, int, int]] = 1,
inplace: Optional[bool] = True,
bias: Optional[bool] = None,
) -> None:
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups,
norm_layer,
activation_layer,
dilation,
inplace,
bias,
torch.nn.Conv3d,
)
class SqueezeExcitation(torch.nn.Module):
"""
This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
Args:
input_channels (int): Number of channels in the input image
squeeze_channels (int): Number of squeeze channels
activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU``
scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid``
"""
def __init__(
self,
input_channels: int,
squeeze_channels: int,
activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
) -> None:
super().__init__()
_log_api_usage_once(self)
self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)
self.activation = activation()
self.scale_activation = scale_activation()
def _scale(self, input: Tensor) -> Tensor:
scale = self.avgpool(input)
scale = self.fc1(scale)
scale = self.activation(scale)
scale = self.fc2(scale)
return self.scale_activation(scale)
def forward(self, input: Tensor) -> Tensor:
scale = self._scale(input)
return scale * input
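# Example (illustrative): channel-wise reweighting with a 4x squeeze
#   se = SqueezeExcitation(input_channels=64, squeeze_channels=16)
#   y = se(torch.randn(2, 64, 32, 32))   # same shape as the input, each channel scaled by a factor in (0, 1)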
class MLP(torch.nn.Sequential):
"""This block implements the multi-layer perceptron (MLP) module.
Args:
in_channels (int): Number of channels of the input
hidden_channels (List[int]): List of the hidden channel dimensions
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer won't be used. Default: ``None``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool): Whether to use bias in the linear layer. Default ``True``
dropout (float): The probability for the dropout layer. Default: 0.0
"""
def __init__(
self,
in_channels: int,
hidden_channels: List[int],
norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
inplace: Optional[bool] = True,
bias: bool = True,
dropout: float = 0.0,
):
# The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
# https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
params = {} if inplace is None else {"inplace": inplace}
layers = []
in_dim = in_channels
for hidden_dim in hidden_channels[:-1]:
layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
if norm_layer is not None:
layers.append(norm_layer(hidden_dim))
layers.append(activation_layer(**params))
layers.append(torch.nn.Dropout(dropout, **params))
in_dim = hidden_dim
layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
layers.append(torch.nn.Dropout(dropout, **params))
super().__init__(*layers)
_log_api_usage_once(self)
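# Example (illustrative): a two-layer MLP 128 -> 256 -> 10 with ReLU and dropout
#   mlp = MLP(128, [256, 10], dropout=0.1)
#   out = mlp(torch.randn(4, 128))   # -> shape (4, 10)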
class Permute(torch.nn.Module):
"""This module returns a view of the tensor input with its dimensions permuted.
Args:
dims (List[int]): The desired ordering of dimensions
"""
def __init__(self, dims: List[int]):
super().__init__()
self.dims = dims
def forward(self, x: Tensor) -> Tensor:
return torch.permute(x, self.dims)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from torch import nn, Tensor
from typing import List, Optional
from model.image_list import ImageList
class AnchorGenerator(nn.Module):
"""
Module that generates anchors for a set of feature maps and
image sizes.
The module supports computing anchors at multiple sizes and aspect ratios
per feature map. This module assumes aspect ratio = height / width for
each anchor.
sizes and aspect_ratios should have the same number of elements, which should
correspond to the number of feature maps.
sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
per spatial location for feature map i.
Args:
sizes (Tuple[Tuple[int]]):
aspect_ratios (Tuple[Tuple[float]]):
"""
__annotations__ = {
"cell_anchors": List[torch.Tensor],
}
def __init__(
self,
sizes=((128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),),
):
super(AnchorGenerator, self).__init__()
if not isinstance(sizes[0], (list, tuple)):
# TODO change this
sizes = tuple((s,) for s in sizes)
if not isinstance(aspect_ratios[0], (list, tuple)):
aspect_ratios = (aspect_ratios,) * len(sizes)
assert len(sizes) == len(aspect_ratios)
self.sizes = sizes
self.aspect_ratios = aspect_ratios
self.cell_anchors = [self.generate_anchors(size, aspect_ratio)
for size, aspect_ratio in zip(sizes, aspect_ratios)]
# TODO: https://github.com/pytorch/pytorch/issues/26792
# For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values.
# (scales, aspect_ratios) are usually an element of zip(self.scales, self.aspect_ratios)
# This method assumes aspect ratio = height / width for an anchor.
def generate_anchors(self, scales: List[int], aspect_ratios: List[float], dtype: torch.dtype = torch.float32,
device: torch.device = torch.device("cpu")):
scales = torch.as_tensor(scales, dtype=dtype, device=device)
aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
h_ratios = torch.sqrt(aspect_ratios)
w_ratios = 1 / h_ratios
ws = (w_ratios[:, None] * scales[None, :]).view(-1)
hs = (h_ratios[:, None] * scales[None, :]).view(-1)
base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
return base_anchors.round()
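# Worked example (illustrative): scales=(32,), aspect_ratios=(0.5, 1.0, 2.0) gives
#   ws ~= [45.3, 32.0, 22.6] and hs ~= [22.6, 32.0, 45.3], so the rounded zero-centered
#   base anchors ([x1, y1, x2, y2]) are
#   [[-23, -11, 23, 11], [-16, -16, 16, 16], [-11, -23, 11, 23]].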
def set_cell_anchors(self, dtype: torch.dtype, device: torch.device):
self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device)
for cell_anchor in self.cell_anchors]
def num_anchors_per_location(self):
return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
# For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
# output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
def grid_anchors(self, grid_sizes: List[List[int]], strides: List[List[Tensor]]) -> List[Tensor]:
anchors = []
cell_anchors = self.cell_anchors
assert cell_anchors is not None
if not (len(grid_sizes) == len(strides) == len(cell_anchors)):
raise ValueError("Anchors should be Tuple[Tuple[int]] because each feature "
"map could potentially have different sizes and aspect ratios. "
"There needs to be a match between the number of "
"feature maps passed and the number of sizes / aspect ratios specified.")
for size, stride, base_anchors in zip(
grid_sizes, strides, cell_anchors
):
grid_height, grid_width = size
stride_height, stride_width = stride
device = base_anchors.device
# For output anchor, compute [x_center, y_center, x_center, y_center]
shifts_x = torch.arange(
0, grid_width, dtype=torch.float32, device=device
) * stride_width
shifts_y = torch.arange(
0, grid_height, dtype=torch.float32, device=device
) * stride_height
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
# For every (base anchor, output anchor) pair,
# offset each zero-centered base anchor by the center of the output anchor.
anchors.append(
(shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
)
return anchors
def forward(self, image_list: Tensor, feature_maps: List[Tensor]) -> List[Tensor]:
grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
image_size = image_list.shape[-2:]
dtype, device = feature_maps[0].dtype, feature_maps[0].device
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
self.set_cell_anchors(dtype, device)
anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
anchors: List[List[torch.Tensor]] = []
for _ in range(image_list.size(0)):
anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps]
anchors.append(anchors_in_image)
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
return anchors
def forward_opt(self, image_shape: torch.Size = None, grid_sizes: List[torch.Size] = None,
device: torch.device = None, dtype=torch.float16):
assert(device is not None)
image_size = torch.Size([800, 800]) if image_shape is None else image_shape[-2:]
grid_sizes = [torch.Size([100, 100]), torch.Size([50, 50]),
torch.Size([25, 25]), torch.Size([13, 13]),
torch.Size([7, 7])] if grid_sizes is None else grid_sizes
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
self.set_cell_anchors(dtype, device)
anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
anchors: List[List[torch.Tensor]] = []
for _ in range(image_shape[0]):
anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps]
anchors.append(anchors_in_image)
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
return anchors
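# Note: with the default forward_opt grid sizes (100^2 + 50^2 + 25^2 + 13^2 + 7^2 = 13343
# locations) and 9 anchors per location (3 scales x 3 aspect ratios, the usual RetinaNet
# setup), each image gets 13343 * 9 = 120087 anchors, matching the scratchpad allocation
# in the training loop.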