yolov9_pytorch

f969ca34 · dongchy920 · f969ca34 · f969ca34 · f969ca34 · f969ca34
Commit f969ca34 authored May 17, 2024 by dongchy920
20 changed files
--- a/utils/loggers/wandb/wandb_utils.py
+++ b/utils/loggers/wandb/wandb_utils.py
+"""Utilities and tools for tracking runs with Weights & Biases."""
+import logging
+import os
+import sys
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Dict
+import yaml
+from tqdm import tqdm
+FILE = Path(__file__).resolve()
+ROOT = FILE.parents[3]  # YOLOv5 root directory
+if str(ROOT) not in sys.path:
+    sys.path.append(str(ROOT))  # add ROOT to PATH
+from utils.dataloaders import LoadImagesAndLabels, img2label_paths
+from utils.general import LOGGER, check_dataset, check_file
+try:
+    import wandb
+    assert hasattr(wandb, '__version__')  # verify package import not local dir
+except (ImportError, AssertionError):
+    wandb = None
+RANK = int(os.getenv('RANK', -1))
+WANDB_ARTIFACT_PREFIX = 'wandb-artifact://'
+def remove_prefix(from_string, prefix=WANDB_ARTIFACT_PREFIX):
+    """Return ``from_string`` without its leading ``prefix``.
+    arguments:
+    from_string (str) -- string that may start with ``prefix``
+    prefix (str) -- prefix to strip, WANDB_ARTIFACT_PREFIX by default
+    returns:
+    ``from_string`` with ``prefix`` removed when it is present, otherwise ``from_string`` unchanged.
+    """
+    # Slice only when the prefix is really there; slicing blindly would silently eat the first
+    # len(prefix) characters of a string that was never prefixed.
+    if prefix and from_string.startswith(prefix):
+        return from_string[len(prefix):]
+    return from_string
+def check_wandb_config_file(data_config_file):
+    """Prefer the '<name>_wandb.<ext>' sibling of a data config when that file exists.
+    arguments:
+    data_config_file (str) -- path of the dataset config file
+    returns:
+    the '_wandb' variant of the path if it is an existing file, otherwise data_config_file itself.
+    """
+    pieces = data_config_file.rsplit('.', 1)  # split on the last dot only
+    candidate = '_wandb.'.join(pieces)
+    return candidate if Path(candidate).is_file() else data_config_file
+def check_wandb_dataset(data_file):
+    """Load a dataset config, keeping W&B artifact links intact when present.
+    arguments:
+    data_file (str | dict) -- path of the dataset .yaml file, or an already-processed dataset dict
+    returns:
+    the dict unchanged if one was passed in; the parsed YAML dict if its 'train' or 'val' entry is a
+    WANDB_ARTIFACT_PREFIX link; otherwise the result of check_dataset(data_file).
+    """
+    if isinstance(data_file, dict):
+        # In that case another dataset manager has already processed it and we don't have to
+        return data_file
+    data_dict = None
+    uses_artifact = False
+    # Open the path returned by check_file (as process_wandb_config_ddp_mode does) rather than the raw
+    # argument. NOTE(review): assumes check_file returns the located path -- confirm in utils.general.
+    resolved_file = check_file(data_file)
+    if resolved_file and data_file.endswith('.yaml'):
+        with open(resolved_file, errors='ignore') as f:
+            data_dict = yaml.safe_load(f) or {}
+        # .get() so a config without 'train'/'val' is left for check_dataset to handle instead of
+        # raising a bare KeyError here
+        uses_artifact = any(
+            isinstance(data_dict.get(split), str) and data_dict[split].startswith(WANDB_ARTIFACT_PREFIX)
+            for split in ('train', 'val'))
+    if uses_artifact:
+        return data_dict
+    return check_dataset(data_file)
+def get_run_info(run_path):
+    """Break a prefixed '<entity>/<project>/<run_id>' run path into its components.
+    arguments:
+    run_path (str) -- run path carrying the WANDB_ARTIFACT_PREFIX
+    returns:
+    (entity, project, run_id, model_artifact_name), where model_artifact_name is 'run_<run_id>_model'.
+    """
+    stripped = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
+    project_dir = stripped.parent
+    run_id, project, entity = stripped.stem, project_dir.stem, project_dir.parent.stem
+    return entity, project, run_id, f'run_{run_id}_model'
+def check_wandb_resume(opt):
+    """Tell whether opt.resume points at a W&B artifact, preparing non-primary DDP ranks on the way.
+    When RANK is neither -1 nor 0, the dataset config is first processed by
+    process_wandb_config_ddp_mode and, for an artifact resume, the latest model artifact is
+    downloaded and opt.weights is pointed at its 'last.pt'.
+    arguments:
+    opt (namespace) -- Commandline arguments for this run
+    returns:
+    True if opt.resume is a string starting with WANDB_ARTIFACT_PREFIX, otherwise None.
+    """
+    is_worker_rank = RANK not in (-1, 0)
+    if is_worker_rank:
+        process_wandb_config_ddp_mode(opt)
+    resume = opt.resume
+    if not (isinstance(resume, str) and resume.startswith(WANDB_ARTIFACT_PREFIX)):
+        return None
+    if is_worker_rank:  # For resuming DDP runs
+        entity, project, _, model_artifact_name = get_run_info(resume)
+        checkpoint = wandb.Api().artifact(f'{entity}/{project}/{model_artifact_name}:latest')
+        opt.weights = str(Path(checkpoint.download()) / "last.pt")
+    return True
+def process_wandb_config_ddp_mode(opt):
+    """Download W&B dataset artifacts referenced by opt.data and switch opt.data to a local config.
+    For each of the 'train' and 'val' entries that is a WANDB_ARTIFACT_PREFIX link, the artifact
+    (with alias opt.artifact_alias) is downloaded and the entry is replaced by the local
+    '<download dir>/data/images/' path. If anything was downloaded, the updated config is written to
+    'wandb_local_data.yaml' inside a download directory and opt.data is set to that file.
+    arguments:
+    opt (namespace) -- Commandline arguments for this run; reads opt.data and opt.artifact_alias
+    """
+    with open(check_file(opt.data), errors='ignore') as f:
+        data_dict = yaml.safe_load(f)  # data dict
+    downloaded = {}  # split name -> local download directory
+    api = None
+    for split in ('train', 'val'):
+        source = data_dict.get(split)
+        if isinstance(source, str) and source.startswith(WANDB_ARTIFACT_PREFIX):
+            api = api or wandb.Api()
+            artifact = api.artifact(remove_prefix(source) + ':' + opt.artifact_alias)
+            downloaded[split] = artifact.download()
+            data_dict[split] = str(Path(downloaded[split]) / 'data/images/')
+    if downloaded:
+        # Prefer the val directory (previous behaviour) but fall back to train: when only the train
+        # split is an artifact there is no val directory and Path(None) would raise TypeError.
+        target_dir = downloaded.get('val') or downloaded.get('train')
+        ddp_data_path = str(Path(target_dir) / 'wandb_local_data.yaml')
+        with open(ddp_data_path, 'w') as f:
+            yaml.safe_dump(data_dict, f)
+        opt.data = ddp_data_path
+class WandbLogger():
+    """Log training runs, datasets, models, and predictions to Weights & Biases.
+    This logger sends information to W&B at wandb.ai. By default, this information
+    includes hyperparameters, system configuration and metrics, model metrics,
+    and basic data metrics and analyses.
+    By providing additional command line arguments to train.py, datasets,
+    models and predictions can also be logged.
+    For more on how this logger is used, see the Weights & Biases documentation:
+    https://docs.wandb.com/guides/integrations/yolov5
+    """
+    def __init__(self, opt, run_id=None, job_type='Training'):
+        """
+        - Initialize WandbLogger instance
+        - Start or attach to a W&B run (resuming the run encoded in opt.resume when it is a
+          WANDB_ARTIFACT_PREFIX link)
+        - Resolve the dataset dictionary and set up training if job_type is 'Training'
+        - Upload the dataset if job_type is 'Dataset Creation'
+        arguments:
+        opt (namespace) -- Commandline arguments for this run
+        run_id (str) -- Run ID of W&B run to be resumed
+        job_type (str) -- To set the job_type for this run
+        """
+        # Temporary-fix
+        # Dataset upload is force-disabled here, so the opt.upload_dataset branch further down is not taken.
+        if opt.upload_dataset:
+            opt.upload_dataset = False
+            # LOGGER.info("Uploading Dataset functionality is not being supported temporarily due to a bug.")
+        # Pre-training routine --
+        self.job_type = job_type
+        # wandb_run starts as the already-active run (wandb.run), or None when wandb is not installed
+        self.wandb, self.wandb_run = wandb, None if not wandb else wandb.run
+        self.val_artifact, self.train_artifact = None, None
+        self.train_artifact_path, self.val_artifact_path = None, None
+        self.result_artifact = None
+        self.val_table, self.result_table = None, None
+        self.bbox_media_panel_images = []
+        self.val_table_path_map = None
+        self.max_imgs_to_log = 16
+        self.wandb_artifact_data_dict = None
+        self.data_dict = None
+        # It's more elegant to stick to 1 wandb.init call,
+        #  but useful config data is overwritten in the WandbLogger's wandb.init call
+        if isinstance(opt.resume, str):  # checks resume from artifact
+            if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
+                entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
+                model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
+                assert wandb, 'install wandb to resume wandb runs'
+                # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
+                self.wandb_run = wandb.init(id=run_id,
+                                            project=project,
+                                            entity=entity,
+                                            resume='allow',
+                                            allow_val_change=True)
+                # From here on opt.resume names the model artifact instead of the run path
+                opt.resume = model_artifact_name
+        # NOTE(review): this elif pairs with the isinstance check above, so a string opt.resume that is
+        # not an artifact link never reaches wandb.init here and keeps whatever wandb.run was.
+        elif self.wandb:
+            self.wandb_run = wandb.init(config=opt,
+                                        resume="allow",
+                                        project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
+                                        entity=opt.entity,
+                                        name=opt.name if opt.name != 'exp' else None,
+                                        job_type=job_type,
+                                        id=run_id,
+                                        allow_val_change=True) if not wandb.run else wandb.run
+        if self.wandb_run:
+            if self.job_type == 'Training':
+                if opt.upload_dataset:
+                    if not opt.resume:
+                        self.wandb_artifact_data_dict = self.check_and_upload_dataset(opt)
+                if isinstance(opt.data, dict):
+                    # This means another dataset manager has already processed the dataset info (e.g. ClearML)
+                    # and they will have stored the already processed dict in opt.data
+                    self.data_dict = opt.data
+                elif opt.resume:
+                    # resume from artifact
+                    if isinstance(opt.resume, str) and opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
+                        # data_dict was stored in the run config by the non-resume branch below
+                        self.data_dict = dict(self.wandb_run.config.data_dict)
+                    else:  # local resume
+                        self.data_dict = check_wandb_dataset(opt.data)
+                else:
+                    self.data_dict = check_wandb_dataset(opt.data)
+                    self.wandb_artifact_data_dict = self.wandb_artifact_data_dict or self.data_dict
+                    # write data_dict to config. useful for resuming from artifacts. Do this only when not resuming.
+                    self.wandb_run.config.update({'data_dict': self.wandb_artifact_data_dict}, allow_val_change=True)
+                self.setup_training(opt)
+            if self.job_type == 'Dataset Creation':
+                self.wandb_run.config.update({"upload_dataset": True})
+                self.data_dict = self.check_and_upload_dataset(opt)
+    def check_and_upload_dataset(self, opt):
+        """
+        Log the dataset as a W&B artifact and read back the config file produced for it
+        arguments:
+        opt (namespace) -- Commandline arguments for current run
+        returns:
+        Dataset info dictionary parsed from the config file returned by log_dataset_artifact.
+        """
+        assert wandb, 'Install wandb to upload dataset'
+        project_name = Path(opt.project).stem if opt.project != 'runs/train' else 'YOLOv5'
+        artifact_config = self.log_dataset_artifact(opt.data, opt.single_cls, project_name)
+        with open(artifact_config, errors='ignore') as config_file:
+            return yaml.safe_load(config_file)
+    def setup_training(self, opt):
+        """
+        Setup the necessary processes for training YOLO models:
+          - Attempt to download model checkpoint and dataset artifacts if opt.resume starts with WANDB_ARTIFACT_PREFIX
+          - Update data_dict, to contain info of previous run if resumed and the paths of dataset artifact if downloaded
+          - Setup log_dict, initialize bbox_interval
+        arguments:
+        opt (namespace) -- commandline arguments for this run
+        """
+        self.log_dict, self.current_epoch = {}, 0
+        self.bbox_interval = opt.bbox_interval
+        if isinstance(opt.resume, str):
+            modeldir, _ = self.download_model_artifact(opt)
+            if modeldir:
+                self.weights = Path(modeldir) / "last.pt"
+                config = self.wandb_run.config
+                # Restore the run settings stored in the W&B run config onto opt.
+                # NOTE(review): self.bbox_interval was read above, before opt.bbox_interval is replaced here.
+                opt.weights, opt.save_period, opt.batch_size, opt.bbox_interval, opt.epochs, opt.hyp, opt.imgsz = str(
+                    self.weights), config.save_period, config.batch_size, config.bbox_interval, config.epochs,\
+                    config.hyp, config.imgsz
+        # Same object as self.data_dict (no copy), so the path rewrites below are visible through both names
+        data_dict = self.data_dict
+        if self.val_artifact is None:  # If --upload_dataset is set, use the existing artifact, don't download
+            self.train_artifact_path, self.train_artifact = self.download_dataset_artifact(
+                data_dict.get('train'), opt.artifact_alias)
+            self.val_artifact_path, self.val_artifact = self.download_dataset_artifact(
+                data_dict.get('val'), opt.artifact_alias)
+        # Point the dataset entries at the downloaded artifact directories
+        if self.train_artifact_path is not None:
+            train_path = Path(self.train_artifact_path) / 'data/images/'
+            data_dict['train'] = str(train_path)
+        if self.val_artifact_path is not None:
+            val_path = Path(self.val_artifact_path) / 'data/images/'
+            data_dict['val'] = str(val_path)
+        if self.val_artifact is not None:
+            # Prepare the evaluation artifact/table: four fixed columns followed by one column per class name
+            self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")
+            columns = ["epoch", "id", "ground truth", "prediction"]
+            columns.extend(self.data_dict['names'])
+            self.result_table = wandb.Table(columns)
+            self.val_table = self.val_artifact.get("val")
+            if self.val_table_path_map is None:
+                self.map_val_table_path()
+        if opt.bbox_interval == -1:
+            # -1 selects an automatic interval: a tenth of the epochs, or every epoch for short runs
+            self.bbox_interval = opt.bbox_interval = (opt.epochs // 10) if opt.epochs > 10 else 1
+            if opt.evolve or opt.noplots:
+                self.bbox_interval = opt.bbox_interval = opt.epochs + 1  # disable bbox_interval
+        train_from_artifact = self.train_artifact_path is not None and self.val_artifact_path is not None
+        # Update the data_dict to point to local artifacts dir
+        if train_from_artifact:
+            self.data_dict = data_dict
+    def download_dataset_artifact(self, path, alias):
+        """
+        Download the dataset artifact if the path starts with WANDB_ARTIFACT_PREFIX
+        arguments:
+        path -- path of the dataset to be used for training
+        alias (str) -- alias of the artifact to be downloaded/used for training
+        returns:
+        (str, wandb.Artifact) -- path of the downloaded dataset and its corresponding artifact object if
+        path is a W&B artifact link, otherwise (None, None)
+        """
+        if not (isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX)):
+            return None, None
+        # Round-trip through Path/as_posix so the artifact name always uses forward slashes
+        artifact_path = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
+        dataset_artifact = wandb.use_artifact(artifact_path.as_posix().replace("\\", "/"))
+        # Message previously carried a stray pair of nested quotes
+        assert dataset_artifact is not None, "Error: W&B dataset artifact doesn't exist"
+        datadir = dataset_artifact.download()
+        return datadir, dataset_artifact
+    def download_model_artifact(self, opt):
+        """
+        Fetch the latest model checkpoint artifact when opt.resume is a W&B artifact link
+        arguments:
+        opt (namespace) -- Commandline arguments for this run
+        returns:
+        (modeldir, model_artifact) -- download directory and artifact object, or (None, None) when
+        opt.resume does not start with WANDB_ARTIFACT_PREFIX
+        """
+        if not opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
+            return None, None
+        artifact_name = remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest"
+        model_artifact = wandb.use_artifact(artifact_name)
+        assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
+        modeldir = model_artifact.download()
+        # An artifact without 'total_epochs' metadata is treated as a finished run
+        assert model_artifact.metadata.get('total_epochs') is not None, \
+            'training is finished, can only resume incomplete runs.'
+        return modeldir, model_artifact
+    def log_model(self, path, opt, epoch, fitness_score, best_model=False):
+        """
+        Upload the 'last.pt' checkpoint found in path as a W&B model artifact
+        arguments:
+        path (Path) -- directory containing the checkpoints
+        opt (namespace) -- Command line arguments for this run
+        epoch (int) -- current epoch number
+        fitness_score (float) -- fitness score for current epoch
+        best_model (boolean) -- whether the current checkpoint is the best so far
+        """
+        epochs_done = epoch + 1
+        metadata = {
+            'original_url': str(path),
+            'epochs_trained': epochs_done,
+            'save period': opt.save_period,
+            'project': opt.project,
+            'total_epochs': opt.epochs,
+            'fitness_score': fitness_score}
+        checkpoint = wandb.Artifact('run_' + wandb.run.id + '_model', type='model', metadata=metadata)
+        checkpoint.add_file(str(path / 'last.pt'), name='last.pt')
+        aliases = ['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']
+        wandb.log_artifact(checkpoint, aliases=aliases)
+        LOGGER.info(f"Saving model artifact on epoch {epochs_done}")
+    def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
+        """
+        Log the dataset as W&B artifact and return the new data file with W&B links
+        arguments:
+        data_file (str) -- the .yaml file with information about the dataset like - path, classes etc.
+        single_cls (boolean)  -- train multi-class data as single-class
+        project (str) -- project name. Used to construct the artifact path
+        overwrite_config (boolean) -- overwrites the data.yaml file if set to true otherwise creates a new
+        file with _wandb postfix. Eg -> data_wandb.yaml
+        returns:
+        the new .yaml file with artifact links. it can be used to start training directly from artifacts.
+        When only the val set is logged no new file is written and Path(data_file) is returned.
+        """
+        upload_dataset = self.wandb_run.config.upload_dataset
+        # upload_dataset == 'val' (string) restricts logging to the validation set
+        log_val_only = isinstance(upload_dataset, str) and upload_dataset == 'val'
+        self.data_dict = check_dataset(data_file)  # parse and check
+        data = dict(self.data_dict)  # shallow copy; its train/val entries are rewritten to artifact links below
+        # NOTE(review): nc is computed but not used in this method
+        nc, names = (1, ['item']) if single_cls else (int(data['nc']), data['names'])
+        names = {k: v for k, v in enumerate(names)}  # to index dictionary
+        # log train set
+        if not log_val_only:
+            self.train_artifact = self.create_dataset_table(LoadImagesAndLabels(data['train'], rect=True, batch_size=1),
+                                                            names,
+                                                            name='train') if data.get('train') else None
+            if data.get('train'):
+                data['train'] = WANDB_ARTIFACT_PREFIX + str(Path(project) / 'train')
+        self.val_artifact = self.create_dataset_table(
+            LoadImagesAndLabels(data['val'], rect=True, batch_size=1), names, name='val') if data.get('val') else None
+        if data.get('val'):
+            data['val'] = WANDB_ARTIFACT_PREFIX + str(Path(project) / 'val')
+        path = Path(data_file)
+        # create a _wandb.yaml file with artifacts links if both train and test set are logged
+        if not log_val_only:
+            path = (path.stem if overwrite_config else path.stem + '_wandb') + '.yaml'  # updated data.yaml path
+            path = ROOT / 'data' / path
+            # 'download' and 'path' entries are dropped from the written config
+            data.pop('download', None)
+            data.pop('path', None)
+            with open(path, 'w') as f:
+                yaml.safe_dump(data, f)
+                LOGGER.info(f"Created dataset config file {path}")
+        if self.job_type == 'Training':  # builds correct artifact pipeline graph
+            if not log_val_only:
+                self.wandb_run.log_artifact(
+                    self.train_artifact)  # calling use_artifact downloads the dataset. NOT NEEDED!
+            self.wandb_run.use_artifact(self.val_artifact)
+            self.val_artifact.wait()
+            self.val_table = self.val_artifact.get('val')
+            self.map_val_table_path()
+        else:
+            # NOTE(review): train_artifact is logged here even when log_val_only is set, in which case
+            # this method did not create it -- confirm this path is intended.
+            self.wandb_run.log_artifact(self.train_artifact)
+            self.wandb_run.log_artifact(self.val_artifact)
+        return path
+    def map_val_table_path(self):
+        """
+        Build self.val_table_path_map from the validation Table: column 3 of each row -> column 0.
+        For tables built by create_dataset_table these columns are "name" and "id", i.e. the map goes
+        from image file name to its row id. Useful for - referencing artifacts for evaluation.
+        """
+        self.val_table_path_map = {}
+        LOGGER.info("Mapping dataset")
+        for row in tqdm(self.val_table.data):
+            file_name, row_id = row[3], row[0]
+            self.val_table_path_map[file_name] = row_id
+    def create_dataset_table(self, dataset: LoadImagesAndLabels, class_to_id: Dict[int, str], name: str = 'dataset'):
+        """
+        Create and return W&B artifact containing W&B Table of the dataset.
+        The artifact receives the image files under 'data/images' and the label files under 'data/labels',
+        plus a Table with the columns "id", "train_image", "Classes" and "name" (one row per dataset item).
+        arguments:
+        dataset -- instance of LoadImagesAndLabels class used to iterate over the data to build Table
+        class_to_id -- hash map that maps class ids to labels
+        name -- name of the artifact
+        returns:
+        dataset artifact to be logged or used
+        """
+        # TODO: Explore multiprocessing to split this loop in parallel | This is essential for speeding up the logging
+        artifact = wandb.Artifact(name=name, type="dataset")
+        # Iterate a single directory when dataset.path is a directory string, otherwise the individual image files
+        img_files = tqdm([dataset.path]) if isinstance(dataset.path, str) and Path(dataset.path).is_dir() else None
+        img_files = tqdm(dataset.im_files) if not img_files else img_files
+        for img_file in img_files:
+            if Path(img_file).is_dir():
+                artifact.add_dir(img_file, name='data/images')
+                # Labels directory is derived by swapping the last 'images' path component for 'labels'
+                labels_path = 'labels'.join(dataset.path.rsplit('images', 1))
+                artifact.add_dir(labels_path, name='data/labels')
+            else:
+                artifact.add_file(img_file, name='data/images/' + Path(img_file).name)
+                label_file = Path(img2label_paths([img_file])[0])
+                # Only images that have a label file on disk get one added
+                artifact.add_file(str(label_file), name='data/labels/' +
+                                  label_file.name) if label_file.exists() else None
+        table = wandb.Table(columns=["id", "train_image", "Classes", "name"])
+        class_set = wandb.Classes([{'id': id, 'name': name} for id, name in class_to_id.items()])
+        for si, (img, labels, paths, shapes) in enumerate(tqdm(dataset)):
+            box_data, img_classes = [], {}
+            # Column 0 of labels is skipped; each remaining row is unpacked as (class, x, y, w, h)
+            # NOTE(review): presumably normalized xywh with column 0 the image index -- confirm against the dataloader
+            for cls, *xywh in labels[:, 1:].tolist():
+                cls = int(cls)
+                box_data.append({
+                    "position": {
+                        "middle": [xywh[0], xywh[1]],
+                        "width": xywh[2],
+                        "height": xywh[3]},
+                    "class_id": cls,
+                    "box_caption": "%s" % (class_to_id[cls])})
+                img_classes[cls] = class_to_id[cls]
+            boxes = {"ground_truth": {"box_data": box_data, "class_labels": class_to_id}}  # inference-space
+            table.add_data(si, wandb.Image(paths, classes=class_set, boxes=boxes), list(img_classes.values()),
+                           Path(paths).name)
+        artifact.add(table, name)
+        return artifact
+    def log_training_progress(self, predn, path, names):
+        """
+        Append one row for the given image to the evaluation Table, reusing the image stored in the
+        validation dataset table. Only predictions with confidence >= 0.25 are kept; the row ends with
+        the mean kept confidence per class (0 for classes without kept predictions).
+        arguments:
+        predn (list): list of predictions in the native space in the format - [xmin, ymin, xmax, ymax, confidence, class]
+        path (str): local path of the current evaluation image
+        names (dict(int, str)): hash map that maps class ids to labels
+        """
+        class_set = wandb.Classes([{'id': cls_id, 'name': cls_name} for cls_id, cls_name in names.items()])
+        conf_per_class = [0] * len(self.data_dict['names'])  # running sums, turned into means below
+        hits_per_class = {}
+        box_data = []
+        for *xyxy, conf, cls in predn.tolist():
+            if not conf >= 0.25:
+                continue
+            cls = int(cls)
+            box_data.append({
+                "position": {
+                    "minX": xyxy[0],
+                    "minY": xyxy[1],
+                    "maxX": xyxy[2],
+                    "maxY": xyxy[3]},
+                "class_id": cls,
+                "box_caption": f"{names[cls]} {conf:.3f}",
+                "scores": {
+                    "class_score": conf},
+                "domain": "pixel"})
+            conf_per_class[cls] += conf
+            hits_per_class[cls] = hits_per_class.get(cls, 0) + 1
+        for seen_cls, hits in hits_per_class.items():
+            conf_per_class[seen_cls] = conf_per_class[seen_cls] / hits
+        boxes = {"predictions": {"box_data": box_data, "class_labels": names}}  # inference-space
+        row_id = self.val_table_path_map[Path(path).name]
+        reference_image = self.val_table.data[row_id][1]
+        self.result_table.add_data(self.current_epoch, row_id, reference_image,
+                                   wandb.Image(reference_image, boxes=boxes, classes=class_set),
+                                   *conf_per_class)
+    def val_one_image(self, pred, predn, path, names, im):
+        """
+        Log validation data for one image: add a row to the result Table when both the validation and
+        result tables exist, and collect the image for the bbox media panel on logging epochs.
+        arguments:
+        pred (list): list of scaled predictions in the format - [xmin, ymin, xmax, ymax, confidence, class]
+        predn (list): list of predictions in the native space - [xmin, ymin, xmax, ymax, confidence, class]
+        path: local path of the current evaluation image (its .name is used as the panel caption)
+        names: hash map that maps class ids to labels
+        im: image passed to wandb.Image for the media panel
+        """
+        if self.val_table and self.result_table:  # Log Table if Val dataset is uploaded as artifact
+            self.log_training_progress(predn, path, names)
+        # Panel images are only gathered after epoch 0 and until max_imgs_to_log is reached
+        if not (len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0):
+            return
+        if self.current_epoch % self.bbox_interval != 0:
+            return
+        box_data = []
+        for *xyxy, conf, cls in pred.tolist():
+            box_data.append({
+                "position": {
+                    "minX": xyxy[0],
+                    "minY": xyxy[1],
+                    "maxX": xyxy[2],
+                    "maxY": xyxy[3]},
+                "class_id": int(cls),
+                "box_caption": f"{names[int(cls)]} {conf:.3f}",
+                "scores": {
+                    "class_score": conf},
+                "domain": "pixel"})
+        boxes = {"predictions": {"box_data": box_data, "class_labels": names}}  # inference-space
+        self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))
+    def log(self, log_dict):
+        """
+        Merge metrics into the pending logging dictionary; does nothing without an active W&B run
+        arguments:
+        log_dict (Dict) -- metrics/media to be logged in current step
+        """
+        if not self.wandb_run:
+            return
+        self.log_dict.update(log_dict.items())
+    def end_epoch(self, best_result=False):
+        """
+        commit the log_dict, model artifacts and Tables to W&B and flush the log_dict.
+        If wandb.log fails, the W&B run is finished and dropped (self.wandb_run becomes None), the failure
+        is reported through LOGGER and the evaluation artifact is not logged for this epoch.
+        arguments:
+        best_result (boolean): Boolean representing if the result of this evaluation is best or not
+        """
+        if not self.wandb_run:
+            return
+        failure = None
+        with all_logging_disabled():
+            if self.bbox_media_panel_images:
+                self.log_dict["BoundingBoxDebugger"] = self.bbox_media_panel_images
+            try:
+                wandb.log(self.log_dict)
+            except Exception as e:  # Exception, not BaseException: Ctrl-C / SystemExit must still propagate
+                failure = e
+                self.wandb_run.finish()
+                self.wandb_run = None
+            self.log_dict = {}
+            self.bbox_media_panel_images = []
+        if failure is not None:
+            # Report only after leaving all_logging_disabled(); inside it this message was suppressed.
+            LOGGER.info(
+                f"An error occurred in wandb logger. The training will proceed without interruption. More info\n{failure}"
+            )
+            # The run is gone, so skip the artifact section below (it dereferences wandb.run).
+            return
+        if self.result_artifact:
+            self.result_artifact.add(self.result_table, 'result')
+            wandb.log_artifact(self.result_artifact,
+                               aliases=[
+                                   'latest', 'last', 'epoch ' + str(self.current_epoch),
+                                   ('best' if best_result else '')])
+            wandb.log({"evaluation": self.result_table})
+            # Start a fresh table/artifact pair for the next epoch
+            columns = ["epoch", "id", "ground truth", "prediction"]
+            columns.extend(self.data_dict['names'])
+            self.result_table = wandb.Table(columns)
+            self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")
+    def finish_run(self):
+        """
+        Flush any pending metrics and finish the current W&B run; no-op when there is no active run
+        """
+        if not self.wandb_run:
+            return
+        pending = self.log_dict
+        if pending:
+            with all_logging_disabled():
+                wandb.log(pending)
+        wandb.run.finish()
+@contextmanager
+def all_logging_disabled(highest_level=logging.CRITICAL):
+    """Context manager that silences logging for the duration of the ``with`` body.
+    Adapted from https://gist.github.com/simon-weber/7853144
+    :param highest_level: level passed to logging.disable while the context is active.
+      This would only need to be changed if a custom level greater than CRITICAL is defined.
+    """
+    manager = logging.root.manager
+    saved_threshold = manager.disable  # restore exactly what was configured before entering
+    logging.disable(highest_level)
+    try:
+        yield
+    finally:
+        logging.disable(saved_threshold)
--- a/utils/loss.py
+++ b/utils/loss.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.metrics import bbox_iou
+from utils.torch_utils import de_parallel
+def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+    """Return the (positive, negative) label-smoothed BCE targets for smoothing amount ``eps``."""
+    negative = 0.5 * eps
+    positive = 1.0 - negative
+    return positive, negative
+class BCEBlurWithLogitsLoss(nn.Module):
+    """BCEWithLogitsLoss with reduced missing label effects.
+    Each element of the unreduced BCE loss is scaled by
+    1 - exp((sigmoid(pred) - true - 1) / (alpha + 1e-4)) before the mean is taken.
+    """
+    def __init__(self, alpha=0.05):
+        super().__init__()
+        self.alpha = alpha
+        self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none')  # must be nn.BCEWithLogitsLoss()
+    def forward(self, pred, true):
+        elementwise = self.loss_fcn(pred, true)
+        dx = torch.sigmoid(pred) - true  # signed gap: reduce only missing label effects
+        blur = 1 - torch.exp((dx - 1) / (self.alpha + 1e-4))
+        return (elementwise * blur).mean()
+class FocalLoss(nn.Module):
+    """Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5).
+    The wrapped criterion's own reduction is remembered and applied after the focal weighting; the
+    criterion itself is switched to reduction='none'.
+    """
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma, self.alpha = gamma, alpha
+        self.reduction = loss_fcn.reduction
+        self.loss_fcn.reduction = 'none'  # required to apply FL to each element
+    def forward(self, pred, true):
+        elementwise = self.loss_fcn(pred, true)
+        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+        prob = torch.sigmoid(pred)  # prob from logits
+        p_t = true * prob + (1 - true) * (1 - prob)
+        alpha_t = true * self.alpha + (1 - true) * (1 - self.alpha)
+        focal = elementwise * (alpha_t * (1.0 - p_t) ** self.gamma)
+        if self.reduction == 'mean':
+            return focal.mean()
+        if self.reduction == 'sum':
+            return focal.sum()
+        return focal  # 'none'
+class QFocalLoss(nn.Module):
+    """Wraps Quality focal loss around existing loss_fcn(), i.e. criteria = QFocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5).
+    The modulating factor is |true - sigmoid(pred)| ** gamma; the wrapped criterion's own reduction is
+    remembered and applied afterwards, while the criterion itself is switched to reduction='none'.
+    """
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma, self.alpha = gamma, alpha
+        self.reduction = loss_fcn.reduction
+        self.loss_fcn.reduction = 'none'  # required to apply FL to each element
+    def forward(self, pred, true):
+        elementwise = self.loss_fcn(pred, true)
+        prob = torch.sigmoid(pred)  # prob from logits
+        alpha_t = true * self.alpha + (1 - true) * (1 - self.alpha)
+        quality = elementwise * (alpha_t * torch.abs(true - prob) ** self.gamma)
+        if self.reduction == 'mean':
+            return quality.mean()
+        if self.reduction == 'sum':
+            return quality.sum()
+        return quality  # 'none'
+class ComputeLoss:
+    # Computes the [box, obj, cls] training losses from the per-layer predictions of the Detect() head.
+    # Hyperparameters come from model.hyp: 'cls_pw', 'obj_pw', 'fl_gamma' and the optional 'label_smoothing'
+    # are read in __init__; 'box', 'obj', 'cls' and 'anchor_t' are read while the loss is computed.
+    sort_obj_iou = False  # if True, matched targets are ordered by IoU before the objectness map is written
+    def __init__(self, model, autobalance=False):
+        # model: must expose .hyp and, through de_parallel(model).model[-1], the Detect() module
+        # autobalance: if True, the per-layer objectness weights are re-estimated on every call
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
+        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h['fl_gamma']  # focal loss gamma
+        if g > 0:
+            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.ssi = list(m.stride).index(16) if autobalance else 0  # stride 16 index
+        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.anchors = m.anchors
+        self.device = device
+    def __call__(self, p, targets):  # predictions, targets
+        # p: list of per-layer prediction tensors, indexed below as (image, channel, grid_y, grid_x)
+        # targets: (n, 6) rows of (image, class, x, y, w, h)
+        # Returns (summed loss * batch size, detached [box, obj, cls] tensor)
+        bs = p[0].shape[0]  # batch size
+        loss = torch.zeros(3, device=self.device)  # [box, obj, cls] losses
+        tcls, tbox, indices = self.build_targets(p, targets)  # targets
+        # Losses
+        for i, pi in enumerate(p):  # layer index, layer predictions
+            b, gj, gi = indices[i]  # image, grid_y, grid_x indices
+            tobj = torch.zeros((pi.shape[0], pi.shape[2], pi.shape[3]), dtype=pi.dtype, device=self.device)  # tgt obj
+            n_labels = b.shape[0]  # number of labels
+            if n_labels:
+                pxy, pwh, _, pcls = pi[b, :, gj, gi].split((2, 2, 1, self.nc), 1)  # target-subset of predictions
+                # Regression
+                pxy = pxy.sigmoid() * 1.6 - 0.3
+                pwh = (0.2 + pwh.sigmoid() * 4.8) * self.anchors[i]
+                pbox = torch.cat((pxy, pwh), 1)  # predicted box
+                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target)
+                loss[0] += (1.0 - iou).mean()  # box loss
+                # Objectness
+                iou = iou.detach().clamp(0).type(tobj.dtype)
+                if self.sort_obj_iou:
+                    j = iou.argsort()
+                    b, gj, gi, iou = b[j], gj[j], gi[j], iou[j]
+                if self.gr < 1:
+                    iou = (1.0 - self.gr) + self.gr * iou
+                tobj[b, gj, gi] = iou  # iou ratio
+                # Classification
+                if self.nc > 1:  # cls loss (only if multiple classes)
+                    t = torch.full_like(pcls, self.cn, device=self.device)  # targets
+                    t[range(n_labels), tcls[i]] = self.cp
+                    loss[2] += self.BCEcls(pcls, t)  # cls loss
+            obji = self.BCEobj(pi[:, 4], tobj)
+            loss[1] += obji * self.balance[i]  # obj loss
+            if self.autobalance:
+                self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()
+        if self.autobalance:
+            self.balance = [x / self.balance[self.ssi] for x in self.balance]
+        loss[0] *= self.hyp['box']
+        loss[1] *= self.hyp['obj']
+        loss[2] *= self.hyp['cls']
+        return loss.sum() * bs, loss.detach()  # [box, obj, cls] losses
+    def build_targets(self, p, targets):
+        # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
+        # Returns three lists with one entry per layer: class indices, boxes (xy offset inside the cell plus
+        # grid-space wh) and (image, grid_y, grid_x) index tuples. All entries are empty when there are no targets.
+        nt = targets.shape[0]  # number of targets
+        tcls, tbox, indices = [], [], []
+        gain = torch.ones(6, device=self.device)  # normalized to gridspace gain
+        g = 0.3  # bias
+        off = torch.tensor(
+            [
+                [0, 0],
+                [1, 0],
+                [0, 1],
+                [-1, 0],
+                [0, -1],  # j,k,l,m
+            ],
+            device=self.device).float() * g  # offsets
+        for i in range(self.nl):
+            shape = p[i].shape
+            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
+            # Match targets to anchors
+            t = targets * gain  # shape(n,6), xywh now in grid units
+            if nt:
+                # Matches
+                r = t[..., 4:6] / self.anchors[i]  # wh ratio
+                j = torch.max(r, 1 / r).max(1)[0] < self.hyp['anchor_t']  # compare
+                t = t[j]  # filter
+                # Offsets
+                gxy = t[:, 2:4]  # grid xy
+                gxi = gain[[2, 3]] - gxy  # inverse
+                j, k = ((gxy % 1 < g) & (gxy > 1)).T
+                l, m = ((gxi % 1 < g) & (gxi > 1)).T
+                j = torch.stack((torch.ones_like(j), j, k, l, m))
+                t = t.repeat((5, 1, 1))[j]
+                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
+            else:
+                # No labels: keep the empty (0, 6) tensor so the chunk/transpose below produce empty outputs.
+                # Indexing targets[0] here raised IndexError because the tensor has no rows.
+                offsets = 0
+            # Define
+            bc, gxy, gwh = t.chunk(3, 1)  # (image, class), grid xy, grid wh
+            b, c = bc.long().T  # image, class
+            gij = (gxy - offsets).long()
+            gi, gj = gij.T  # grid indices
+            # Append
+            indices.append((b, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, grid_y, grid_x indices
+            tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
+            tcls.append(c)  # class
+        return tcls, tbox, indices
+class ComputeLoss_NEW:
+    # Variant loss: every target is expanded to 5 candidate cells per layer, and the candidates with the
+    # lowest box+cls loss (n_assign of them per label, across all layers) are the ones that contribute.
+    # Hyperparameters are read from model.hyp ('cls_pw', 'obj_pw', 'fl_gamma', optional 'label_smoothing',
+    # 'box', 'obj', 'cls', 'anchor_t').
+    sort_obj_iou = False  # NOTE(review): not read anywhere in this class as shown
+    # Compute losses
+    def __init__(self, model, autobalance=False):
+        # model: must expose .hyp and, through de_parallel(model).model[-1], the Detect() module
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
+        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h['fl_gamma']  # focal loss gamma
+        if g > 0:
+            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.ssi = list(m.stride).index(16) if autobalance else 0  # stride 16 index
+        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.anchors = m.anchors
+        self.device = device
+        self.BCE_base = nn.BCEWithLogitsLoss(reduction='none')  # unweighted, per-element BCE used for ranking
+    def __call__(self, p, targets):  # predictions, targets
+        # p: list of per-layer predictions; targets: rows of (image, class, x, y, w, h)
+        # Returns (summed loss * batch size, detached [box, obj, cls] tensor)
+        tcls, tbox, indices = self.build_targets(p, targets)  # targets
+        bs = p[0].shape[0]  # batch size
+        n_labels = targets.shape[0]  # number of labels
+        loss = torch.zeros(3, device=self.device)  # [box, obj, cls] losses
+        # Compute all losses
+        all_loss = []  # per layer: [box loss, obj loss, cls loss, obj target, valid flag]
+        for i, pi in enumerate(p):  # layer index, layer predictions
+            b, gj, gi = indices[i]  # image, grid_y, grid_x indices
+            if n_labels:
+                pxy, pwh, pobj, pcls = pi[b, :, gj, gi].split((2, 2, 1, self.nc), 2)  # target-subset of predictions
+                # Regression
+                pbox = torch.cat((pxy.sigmoid() * 1.6 - 0.3, (0.2 + pwh.sigmoid() * 4.8) * self.anchors[i]), 2)
+                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(predicted_box, target_box)
+                obj_target = iou.detach().clamp(0).type(pi.dtype)  # objectness targets
+                all_loss.append([(1.0 - iou) * self.hyp['box'],
+                                 self.BCE_base(pobj.squeeze(), torch.ones_like(obj_target)) * self.hyp['obj'],
+                                 self.BCE_base(pcls, F.one_hot(tcls[i], self.nc).float()).mean(2) * self.hyp['cls'],
+                                 obj_target,
+                                 tbox[i][..., 2] > 0.0])  # valid
+        # Keep the n_assign lowest box+cls losses per label
+        # NOTE(review): with zero labels all_loss stays empty, so cat_loss is [] and cat_loss[0] below would
+        # raise IndexError — confirm that callers never pass an empty targets tensor.
+        n_assign = 4  # top n matches
+        cat_loss = [torch.cat(x, 1) for x in zip(*all_loss)]  # concatenate the layers along dim 1
+        ij = torch.zeros_like(cat_loss[0]).bool()  # selection mask over all candidates
+        sum_loss = cat_loss[0] + cat_loss[2]  # ranking criterion: box loss + cls loss
+        for col in torch.argsort(sum_loss, dim=1).T[:n_assign]:
+            # ij[range(n_labels), col] = True
+            ij[range(n_labels), col] = cat_loss[4][range(n_labels), col]  # select only candidates flagged valid
+        loss[0] = cat_loss[0][ij].mean() * self.nl  # box loss
+        loss[2] = cat_loss[2][ij].mean() * self.nl  # cls loss
+        # Obj loss
+        for i, (h, pi) in enumerate(zip(ij.chunk(self.nl, 1), p)):  # layer index, (layer mask, layer predictions)
+            b, gj, gi = indices[i]  # image, grid_y, grid_x indices
+            tobj = torch.zeros((pi.shape[0], pi.shape[2], pi.shape[3]), dtype=pi.dtype, device=self.device)  # obj
+            if n_labels:  # if any labels
+                tobj[b[h], gj[h], gi[h]] = all_loss[i][3][h]  # IoU-based target at the selected cells
+            loss[1] += self.BCEobj(pi[:, 4], tobj) * (self.balance[i] * self.hyp['obj'])
+        return loss.sum() * bs, loss.detach()  # [box, obj, cls] losses
+    def build_targets(self, p, targets):
+        # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
+        # Unlike a filtering scheme, all 5 candidates of every target are kept; unsuitable ones get wh = 0,
+        # which is what the "valid" flag in __call__ (tbox[..., 2] > 0) detects.
+        nt = targets.shape[0]  # number of targets
+        tcls, tbox, indices = [], [], []
+        gain = torch.ones(6, device=self.device)  # normalized to gridspace gain
+        g = 0.3  # bias
+        off = torch.tensor(
+            [
+                [0, 0],
+                [1, 0],
+                [0, 1],
+                [-1, 0],
+                [0, -1],  # j,k,l,m
+                # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
+            ],
+            device=self.device).float()  # offsets (whole cells; not scaled by g here)
+        for i in range(self.nl):
+            shape = p[i].shape
+            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
+            # Match targets to anchors
+            t = targets * gain  # xywh now in grid units
+            if nt:
+                # # Matches
+                r = t[..., 4:6] / self.anchors[i]  # wh ratio
+                a = torch.max(r, 1 / r).max(1)[0] < self.hyp['anchor_t']  # compare
+                # a = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
+                # t = t[a]  # filter
+                # # Offsets
+                gxy = t[:, 2:4]  # grid xy
+                gxi = gain[[2, 3]] - gxy  # inverse
+                j, k = ((gxy % 1 < g) & (gxy > 1)).T
+                l, m = ((gxi % 1 < g) & (gxi > 1)).T
+                j = torch.stack((torch.ones_like(j), j, k, l, m)) & a  # candidate is usable only if the anchor test passed
+                t = t.repeat((5, 1, 1))
+                offsets = torch.zeros_like(gxy)[None] + off[:, None]
+                t[..., 4:6][~j] = 0.0  # move unsuitable targets far away
+            else:
+                # NOTE(review): targets has no rows on this path, so targets[0] looks like it would raise
+                # IndexError — confirm whether empty batches can reach this class.
+                t = targets[0]
+                offsets = 0
+            # Define
+            bc, gxy, gwh = t.chunk(3, 2)  # (image, class), grid xy, grid wh
+            b, c = bc.long().transpose(0, 2).contiguous()  # image, class
+            gij = (gxy - offsets).long()
+            gi, gj = gij.transpose(0, 2).contiguous()  # grid indices
+            # Append
+            indices.append((b, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, grid_y, grid_x indices
+            tbox.append(torch.cat((gxy - gij, gwh), 2).permute(1, 0, 2).contiguous())  # box
+            tcls.append(c)  # class
+            # # Unique
+            # n1 = torch.cat((b.view(-1, 1), tbox[i].view(-1, 4)), 1).shape[0]
+            # n2 = tbox[i].view(-1, 4).unique(dim=0).shape[0]
+            # print(f'targets-unique {n1}-{n2} diff={n1-n2}')
+        return tcls, tbox, indices
--- a/utils/loss_tal.py
+++ b/utils/loss_tal.py
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.general import xywh2xyxy
+from utils.metrics import bbox_iou
+from utils.tal.anchor_generator import dist2bbox, make_anchors, bbox2dist
+from utils.tal.assigner import TaskAlignedAssigner
+from utils.torch_utils import de_parallel
+def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+    # Label-smoothing BCE targets: returns the pair (positive, negative) = (1 - eps / 2, eps / 2)
+    negative = 0.5 * eps
+    return 1.0 - negative, negative
+class VarifocalLoss(nn.Module):
+    # Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367
+    def __init__(self):
+        super().__init__()
+    def forward(self, pred_score, gt_score, label, alpha=0.75, gamma=2.0):
+        # Returns the summed, weighted BCE of the logits pred_score against gt_score.
+        # Where label is 1 the weight is gt_score; where label is 0 it is alpha * sigmoid(pred_score) ** gamma.
+        negative_weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label)
+        weight = negative_weight + gt_score * label
+        # The BCE itself is evaluated on float32 copies with CUDA autocast disabled
+        with torch.cuda.amp.autocast(enabled=False):
+            bce = F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none")
+            loss = (bce * weight).sum()
+        return loss
+class FocalLoss(nn.Module):
+    # Focal-loss wrapper around an existing criterion, i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
+    # The criterion's own reduction is remembered here and then overwritten with "none" on the passed-in object,
+    # so the focal weighting can be applied to every element before the remembered reduction is performed.
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma, self.alpha = gamma, alpha
+        self.reduction = loss_fcn.reduction  # reduction requested by the caller
+        self.loss_fcn.reduction = "none"  # required to apply FL to each element
+    def forward(self, pred, true):
+        # pred: logits, true: targets; returns the weighted loss reduced according to self.reduction
+        elementwise = self.loss_fcn(pred, true)
+        # Weighting as in the TF implementation
+        # https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+        prob = torch.sigmoid(pred)  # prob from logits
+        p_t = true * prob + (1 - true) * (1 - prob)
+        alpha_weight = true * self.alpha + (1 - true) * (1 - self.alpha)
+        focal_weight = (1.0 - p_t) ** self.gamma
+        elementwise *= alpha_weight * focal_weight
+        if self.reduction == "mean":
+            return elementwise.mean()
+        if self.reduction == "sum":
+            return elementwise.sum()
+        return elementwise  # 'none'
+class BboxLoss(nn.Module):
+    # Box regression loss: a CIoU term plus, when use_dfl is set, a distribution focal loss (DFL) term.
+    # Each box side is described by reg_max + 1 logits in pred_dist.
+    def __init__(self, reg_max, use_dfl=False):
+        super().__init__()
+        self.reg_max = reg_max
+        self.use_dfl = use_dfl
+    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
+        # Returns (loss_iou, loss_dfl, iou) where iou is the CIoU of every entry selected by fg_mask.
+        # Both losses are weighted by the summed target scores of each entry and divided by target_scores_sum.
+        box_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4])  # (b, h*w, 4)
+        fg_pred_boxes = torch.masked_select(pred_bboxes, box_mask).view(-1, 4)
+        fg_target_boxes = torch.masked_select(target_bboxes, box_mask).view(-1, 4)
+        fg_weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1)
+        # iou loss
+        iou = bbox_iou(fg_pred_boxes, fg_target_boxes, xywh=False, CIoU=True)
+        loss_iou = 1.0 - iou
+        loss_iou *= fg_weight
+        loss_iou = loss_iou.sum() / target_scores_sum
+        # dfl loss
+        if not self.use_dfl:
+            return loss_iou, torch.tensor(0.0).to(pred_dist.device), iou
+        n_bins = self.reg_max + 1
+        dist_mask = fg_mask.unsqueeze(-1).repeat([1, 1, n_bins * 4])
+        fg_pred_dist = torch.masked_select(pred_dist, dist_mask).view(-1, 4, n_bins)
+        target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
+        fg_target_ltrb = torch.masked_select(target_ltrb, box_mask).view(-1, 4)
+        loss_dfl = self._df_loss(fg_pred_dist, fg_target_ltrb) * fg_weight
+        loss_dfl = loss_dfl.sum() / target_scores_sum
+        return loss_iou, loss_dfl, iou
+    def _df_loss(self, pred_dist, target):
+        # Cross-entropy against the truncated integer target and the next integer, blended by how close the
+        # real-valued target is to each; the result is averaged over the last dim (kept as size 1).
+        lower = target.to(torch.long)
+        upper = lower + 1
+        w_lower = upper.to(torch.float) - target
+        w_upper = 1 - w_lower
+        ce_lower = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), lower.view(-1), reduction="none")
+        ce_upper = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), upper.view(-1), reduction="none")
+        loss_lower = ce_lower.view(lower.shape) * w_lower
+        loss_upper = ce_upper.view(lower.shape) * w_upper
+        return (loss_lower + loss_upper).mean(-1, keepdim=True)
+class ComputeLoss:
+    # Compute losses
+    # Produces the [box, cls, dfl] losses for a single set of head outputs; targets are matched to predictions
+    # by a TaskAlignedAssigner.
+    # NOTE(review): self.balance, self.cp and self.cn are set in __init__ but not read by any method of this
+    # class as shown.
+    def __init__(self, model, use_dfl=True):
+        # model: must expose .hyp and, through de_parallel(model).model[-1], the Detect() module
+        # use_dfl: forwarded to BboxLoss and used by bbox_decode to decide whether distributions are decoded
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h["cls_pw"]], device=device), reduction='none')
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h["fl_gamma"]  # focal loss gamma
+        if g > 0:
+            BCEcls = FocalLoss(BCEcls, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.BCEcls = BCEcls
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.reg_max = m.reg_max
+        self.device = device
+        # Assigner settings can be overridden through the environment variables YOLOM (topk), YOLOA (alpha)
+        # and YOLOB (beta); the defaults are 10, 0.5 and 6.0.
+        self.assigner = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max).float().to(device)  # / 120.0
+        self.use_dfl = use_dfl
+    def preprocess(self, targets, batch_size, scale_tensor):
+        # Regroups flat targets (column 0 = image index) into a zero-padded (batch_size, max_per_image, 5)
+        # tensor holding columns 1: of each row, then scales columns 1:5 by scale_tensor and passes them
+        # through xywh2xyxy.
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+    def bbox_decode(self, anchor_points, pred_dist):
+        # With use_dfl, each group of c // 4 logits is softmaxed and reduced to its expected value under
+        # self.proj (0 .. reg_max - 1); the resulting 4 distances are turned into boxes by dist2bbox.
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+    def __call__(self, p, targets, img=None, epoch=0):
+        # p: list of per-layer feature maps, or a tuple whose element 1 is that list
+        # targets: flat rows with the image index in column 0 (see preprocess)
+        # img, epoch: accepted but not used in this method
+        # Returns (summed loss * batch size, detached [box, cls, dfl] tensor)
+        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
+        feats = p[1] if isinstance(p, tuple) else p
+        # Flatten every layer to (batch, no, cells), join the layers and split the channels into
+        # box-distribution logits (reg_max * 4) and class logits (nc)
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        dtype = pred_scores.dtype
+        batch_size, grid_size = pred_scores.shape[:2]  # grid_size is not used below
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+        # targets
+        targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)  # 1 where the box coordinates sum to > 0 (padding rows are all zero)
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+        # The assigner receives detached predictions, with boxes and anchor points multiplied by the stride
+        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_bboxes /= stride_tensor  # back to the same units as pred_bboxes
+        target_scores_sum = max(target_scores.sum(), 1)  # normaliser, floored at 1
+        # cls loss
+        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
+        loss[1] = self.BCEcls(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE
+        # bbox loss
+        if fg_mask.sum():
+            loss[0], loss[2], iou = self.bbox_loss(pred_distri,
+                                                   pred_bboxes,
+                                                   anchor_points,
+                                                   target_bboxes,
+                                                   target_scores,
+                                                   target_scores_sum,
+                                                   fg_mask)
+        # NOTE(review): the gains below are hard-coded; self.hyp is not consulted for them here
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 0.5  # cls gain
+        loss[2] *= 1.5  # dfl gain
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
--- a/utils/loss_tal_dual.py
+++ b/utils/loss_tal_dual.py
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.general import xywh2xyxy
+from utils.metrics import bbox_iou
+from utils.tal.anchor_generator import dist2bbox, make_anchors, bbox2dist
+from utils.tal.assigner import TaskAlignedAssigner
+from utils.torch_utils import de_parallel
+def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+    # Label-smoothing BCE targets: half of eps is taken from the positive target and given to the negative one
+    half_eps = 0.5 * eps
+    positive, negative = 1.0 - half_eps, half_eps
+    return positive, negative
+class VarifocalLoss(nn.Module):
+    # Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367
+    def __init__(self):
+        super().__init__()
+    def forward(self, pred_score, gt_score, label, alpha=0.75, gamma=2.0):
+        # Summed BCE of the logits pred_score against gt_score, weighted per element:
+        # gt_score where label is 1, alpha * sigmoid(pred_score) ** gamma where label is 0.
+        positive_weight = gt_score * label
+        weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + positive_weight
+        # The BCE itself is evaluated on float32 copies with CUDA autocast disabled
+        with torch.cuda.amp.autocast(enabled=False):
+            elementwise = F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none")
+            total = (elementwise * weight).sum()
+        return total
+class FocalLoss(nn.Module):
+    # Focal-loss wrapper around an existing criterion, i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
+    # The wrapped criterion is switched to "none" reduction (the passed-in object is modified); the reduction it
+    # had before is kept on this module and applied after the focal weighting.
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma = gamma
+        self.alpha = alpha
+        original_reduction = loss_fcn.reduction
+        self.reduction = original_reduction
+        self.loss_fcn.reduction = "none"  # required to apply FL to each element
+    def forward(self, pred, true):
+        # pred: logits, true: targets; returns the weighted loss reduced according to self.reduction
+        per_element = self.loss_fcn(pred, true)
+        # Weighting as in the TF implementation
+        # https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+        pred_prob = torch.sigmoid(pred)  # prob from logits
+        prob_of_truth = true * pred_prob + (1 - true) * (1 - pred_prob)
+        class_weight = true * self.alpha + (1 - true) * (1 - self.alpha)
+        hardness_weight = (1.0 - prob_of_truth) ** self.gamma
+        per_element *= class_weight * hardness_weight
+        if self.reduction == "mean":
+            return per_element.mean()
+        if self.reduction == "sum":
+            return per_element.sum()
+        return per_element  # 'none'
+class BboxLoss(nn.Module):
+    # Box regression loss: a CIoU term plus, when use_dfl is set, a distribution focal loss (DFL) term.
+    # Each box side is described by reg_max + 1 logits in pred_dist.
+    def __init__(self, reg_max, use_dfl=False):
+        super().__init__()
+        self.reg_max = reg_max
+        self.use_dfl = use_dfl
+    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
+        # Returns (loss_iou, loss_dfl, iou) where iou is the CIoU of every entry selected by fg_mask.
+        # Both losses are weighted by the summed target scores of each entry and divided by target_scores_sum.
+        mask4 = fg_mask.unsqueeze(-1).repeat([1, 1, 4])  # (b, h*w, 4)
+        pos_pred = torch.masked_select(pred_bboxes, mask4).view(-1, 4)
+        pos_target = torch.masked_select(target_bboxes, mask4).view(-1, 4)
+        pos_weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1)
+        # iou loss
+        iou = bbox_iou(pos_pred, pos_target, xywh=False, CIoU=True)
+        loss_iou = 1.0 - iou
+        loss_iou *= pos_weight
+        loss_iou = loss_iou.sum() / target_scores_sum
+        # dfl loss
+        if self.use_dfl:
+            bins = self.reg_max + 1
+            mask_dist = fg_mask.unsqueeze(-1).repeat([1, 1, bins * 4])
+            pos_dist = torch.masked_select(pred_dist, mask_dist).view(-1, 4, bins)
+            ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
+            pos_ltrb = torch.masked_select(ltrb, mask4).view(-1, 4)
+            loss_dfl = self._df_loss(pos_dist, pos_ltrb) * pos_weight
+            loss_dfl = loss_dfl.sum() / target_scores_sum
+        else:
+            loss_dfl = torch.tensor(0.0).to(pred_dist.device)  # zero placeholder when DFL is disabled
+        return loss_iou, loss_dfl, iou
+    def _df_loss(self, pred_dist, target):
+        # Cross-entropy against the truncated integer target and the next integer, blended by how close the
+        # real-valued target is to each; the result is averaged over the last dim (kept as size 1).
+        left = target.to(torch.long)
+        right = left + 1
+        left_share = right.to(torch.float) - target
+        right_share = 1 - left_share
+        out_shape = left.shape
+        ce_left = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), left.view(-1), reduction="none")
+        ce_right = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), right.view(-1), reduction="none")
+        blended = ce_left.view(out_shape) * left_share + ce_right.view(out_shape) * right_share
+        return blended.mean(-1, keepdim=True)
+class ComputeLoss:
+    # Compute losses
+    def __init__(self, model, use_dfl=True):
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h["cls_pw"]], device=device), reduction='none')
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h["fl_gamma"]  # focal loss gamma
+        if g > 0:
+            BCEcls = FocalLoss(BCEcls, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.BCEcls = BCEcls
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.reg_max = m.reg_max
+        self.device = device
+        self.assigner = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.assigner2 = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.bbox_loss2 = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max).float().to(device)  # / 120.0
+        self.use_dfl = use_dfl
+    def preprocess(self, targets, batch_size, scale_tensor):
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+    def bbox_decode(self, anchor_points, pred_dist):
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+    def __call__(self, p, targets, img=None, epoch=0):
+        """Compute the box, cls and dfl losses accumulated over two prediction branches.
+        Args:
+            p: model output. When it is a tuple the two per-branch feature lists are read from
+                ``p[1][0]`` and ``p[1][1]``; otherwise from ``p[0]`` and ``p[1]``.
+            targets: ground-truth rows, handed to ``self.preprocess`` together with the batch size.
+            img: not used in this method.
+            epoch: not used in this method.
+        Returns:
+            ``(loss.sum() * batch_size, loss.detach())`` where ``loss`` holds (box, cls, dfl).
+        """
+        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
+        # Per-branch feature lists: nested under p[1] when p is a tuple, otherwise taken from p directly
+        feats = p[1][0] if isinstance(p, tuple) else p[0]
+        feats2 = p[1][1] if isinstance(p, tuple) else p[1]
+        # Flatten each level to (batch, no, -1), concatenate along the last axis, then split the channel
+        # axis into reg_max * 4 distribution channels and nc class channels
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        # Move channels last: (batch, anchors, channels)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        # Same flatten/split/permute for the second branch
+        pred_distri2, pred_scores2 = torch.cat([xi.view(feats2[0].shape[0], self.no, -1) for xi in feats2], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores2 = pred_scores2.permute(0, 2, 1).contiguous()
+        pred_distri2 = pred_distri2.permute(0, 2, 1).contiguous()
+        dtype = pred_scores.dtype
+        batch_size, grid_size = pred_scores.shape[:2]
+        # Image size is recovered from the first feature level's spatial shape times the first stride
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        # Anchors are built from the first branch's features only and shared by both branches
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+        # targets: scale tensor is ordered (w, h, w, h)
+        targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        # A ground-truth slot counts as valid when its box coordinates sum to more than zero
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+        pred_bboxes2 = self.bbox_decode(anchor_points, pred_distri2)  # xyxy, (b, h*w, 4)
+        # Each branch gets its own assignment, computed from that branch's detached predictions
+        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_labels2, target_bboxes2, target_scores2, fg_mask2 = self.assigner2(
+            pred_scores2.detach().sigmoid(),
+            (pred_bboxes2.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        # Assigned boxes are divided by the stride so they match the scale of pred_bboxes;
+        # the score sums are floored at 1 so the divisions below cannot be by zero
+        target_bboxes /= stride_tensor
+        target_scores_sum = max(target_scores.sum(), 1)
+        target_bboxes2 /= stride_tensor
+        target_scores_sum2 = max(target_scores2.sum(), 1)
+        # cls loss: first branch scaled by 0.25, second branch added unscaled
+        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
+        loss[1] = self.BCEcls(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
+        loss[1] *= 0.25
+        loss[1] += self.BCEcls(pred_scores2, target_scores2.to(dtype)).sum() / target_scores_sum2 # BCE
+        # bbox loss: only evaluated for a branch that has at least one foreground anchor
+        if fg_mask.sum():
+            loss[0], loss[2], iou = self.bbox_loss(pred_distri,
+                                                   pred_bboxes,
+                                                   anchor_points,
+                                                   target_bboxes,
+                                                   target_scores,
+                                                   target_scores_sum,
+                                                   fg_mask)
+            # first branch box/dfl terms scaled by 0.25, like its cls term
+            loss[0] *= 0.25
+            loss[2] *= 0.25
+        if fg_mask2.sum():
+            loss0_, loss2_, iou2 = self.bbox_loss2(pred_distri2,
+                                                   pred_bboxes2,
+                                                   anchor_points,
+                                                   target_bboxes2,
+                                                   target_scores2,
+                                                   target_scores_sum2,
+                                                   fg_mask2)
+            # second branch box/dfl terms added unscaled
+            loss[0] += loss0_
+            loss[2] += loss2_
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 0.5  # cls gain
+        loss[2] *= 1.5  # dfl gain
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
+class ComputeLossLH:
+    # Compute losses
+    def __init__(self, model, use_dfl=True):
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h["cls_pw"]], device=device), reduction='none')
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h["fl_gamma"]  # focal loss gamma
+        if g > 0:
+            BCEcls = FocalLoss(BCEcls, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.BCEcls = BCEcls
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.reg_max = m.reg_max
+        self.device = device
+        self.assigner = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max).float().to(device)  # / 120.0
+        self.use_dfl = use_dfl
+    def preprocess(self, targets, batch_size, scale_tensor):
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+    def bbox_decode(self, anchor_points, pred_dist):
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+    def __call__(self, p, targets, img=None, epoch=0):
+        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
+        feats = p[1][0] if isinstance(p, tuple) else p[0]
+        feats2 = p[1][1] if isinstance(p, tuple) else p[1]
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        pred_distri2, pred_scores2 = torch.cat([xi.view(feats2[0].shape[0], self.no, -1) for xi in feats2], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores2 = pred_scores2.permute(0, 2, 1).contiguous()
+        pred_distri2 = pred_distri2.permute(0, 2, 1).contiguous()
+        dtype = pred_scores.dtype
+        batch_size, grid_size = pred_scores.shape[:2]
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+        # targets
+        targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+        pred_bboxes2 = self.bbox_decode(anchor_points, pred_distri2)  # xyxy, (b, h*w, 4)
+        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+            pred_scores2.detach().sigmoid(),
+            (pred_bboxes2.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_bboxes /= stride_tensor
+        target_scores_sum = target_scores.sum()
+        # cls loss
+        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
+        loss[1] = self.BCEcls(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
+        loss[1] *= 0.25
+        loss[1] += self.BCEcls(pred_scores2, target_scores.to(dtype)).sum() / target_scores_sum # BCE
+        # bbox loss
+        if fg_mask.sum():
+            loss[0], loss[2], iou = self.bbox_loss(pred_distri,
+                                                   pred_bboxes,
+                                                   anchor_points,
+                                                   target_bboxes,
+                                                   target_scores,
+                                                   target_scores_sum,
+                                                   fg_mask)
+            loss[0] *= 0.25
+            loss[2] *= 0.25
+        if fg_mask.sum():
+            loss0_, loss2_, iou2 = self.bbox_loss(pred_distri2,
+                                                   pred_bboxes2,
+                                                   anchor_points,
+                                                   target_bboxes,
+                                                   target_scores,
+                                                   target_scores_sum,
+                                                   fg_mask)
+            loss[0] += loss0_
+            loss[2] += loss2_
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 0.5  # cls gain
+        loss[2] *= 1.5  # dfl gain
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
--- a/utils/loss_tal_triple.py
+++ b/utils/loss_tal_triple.py
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.general import xywh2xyxy
+from utils.metrics import bbox_iou
+from utils.tal.anchor_generator import dist2bbox, make_anchors, bbox2dist
+from utils.tal.assigner import TaskAlignedAssigner
+from utils.torch_utils import de_parallel
+def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+    # return positive, negative label smoothing BCE targets
+    return 1.0 - 0.5 * eps, 0.5 * eps
+class VarifocalLoss(nn.Module):
+    # Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367
+    def __init__(self):
+        super().__init__()
+    def forward(self, pred_score, gt_score, label, alpha=0.75, gamma=2.0):
+        weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
+        with torch.cuda.amp.autocast(enabled=False):
+            loss = (F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(),
+                                                       reduction="none") * weight).sum()
+        return loss
+class FocalLoss(nn.Module):
+    # Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = loss_fcn.reduction
+        self.loss_fcn.reduction = "none"  # required to apply FL to each element
+    def forward(self, pred, true):
+        loss = self.loss_fcn(pred, true)
+        # p_t = torch.exp(-loss)
+        # loss *= self.alpha * (1.000001 - p_t) ** self.gamma  # non-zero power for gradient stability
+        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+        pred_prob = torch.sigmoid(pred)  # prob from logits
+        p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
+        alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
+        modulating_factor = (1.0 - p_t) ** self.gamma
+        loss *= alpha_factor * modulating_factor
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        else:  # 'none'
+            return loss
+class BboxLoss(nn.Module):
+    def __init__(self, reg_max, use_dfl=False):
+        super().__init__()
+        self.reg_max = reg_max
+        self.use_dfl = use_dfl
+    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
+        # iou loss
+        bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4])  # (b, h*w, 4)
+        pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).view(-1, 4)
+        target_bboxes_pos = torch.masked_select(target_bboxes, bbox_mask).view(-1, 4)
+        bbox_weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1)
+        iou = bbox_iou(pred_bboxes_pos, target_bboxes_pos, xywh=False, CIoU=True)
+        loss_iou = 1.0 - iou
+        loss_iou *= bbox_weight
+        loss_iou = loss_iou.sum() / target_scores_sum
+        # dfl loss
+        if self.use_dfl:
+            dist_mask = fg_mask.unsqueeze(-1).repeat([1, 1, (self.reg_max + 1) * 4])
+            pred_dist_pos = torch.masked_select(pred_dist, dist_mask).view(-1, 4, self.reg_max + 1)
+            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
+            target_ltrb_pos = torch.masked_select(target_ltrb, bbox_mask).view(-1, 4)
+            loss_dfl = self._df_loss(pred_dist_pos, target_ltrb_pos) * bbox_weight
+            loss_dfl = loss_dfl.sum() / target_scores_sum
+        else:
+            loss_dfl = torch.tensor(0.0).to(pred_dist.device)
+        return loss_iou, loss_dfl, iou
+    def _df_loss(self, pred_dist, target):
+        target_left = target.to(torch.long)
+        target_right = target_left + 1
+        weight_left = target_right.to(torch.float) - target
+        weight_right = 1 - weight_left
+        loss_left = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction="none").view(
+            target_left.shape) * weight_left
+        loss_right = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), target_right.view(-1),
+                                     reduction="none").view(target_left.shape) * weight_right
+        return (loss_left + loss_right).mean(-1, keepdim=True)
+class ComputeLoss:
+    """Three-branch loss: each branch gets its own assigner and BboxLoss; branches 1 and 2 are
+    scaled by 0.25 and branch 3 is added unscaled before the box / cls / dfl gains are applied."""
+    def __init__(self, model, use_dfl=True):
+        """Read hyperparameters from ``model.hyp`` and head attributes from the last layer of the model.
+        Args:
+            model: model exposing ``hyp`` (``cls_pw`` and ``fl_gamma`` are required, ``label_smoothing``
+                is optional) and, after ``de_parallel``, a ``model[-1]`` head with ``nl``, ``nc``,
+                ``no``, ``reg_max`` and ``stride``.
+            use_dfl: forwarded to every ``BboxLoss`` and consulted by ``bbox_decode``.
+        """
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h["cls_pw"]], device=device), reduction='none')
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
+        # Focal loss wraps the BCE criterion only when gamma is positive
+        g = h["fl_gamma"]  # focal loss gamma
+        if g > 0:
+            BCEcls = FocalLoss(BCEcls, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.BCEcls = BCEcls
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.reg_max = m.reg_max
+        self.device = device
+        # One assigner per branch, all built with the same settings; topk / alpha / beta can be
+        # overridden through the YOLOM / YOLOA / YOLOB environment variables
+        self.assigner = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.assigner2 = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.assigner3 = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        # One box-loss module per branch
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.bbox_loss2 = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.bbox_loss3 = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        # Bin values 0 .. reg_max - 1 used by bbox_decode to reduce each bin distribution to one value
+        self.proj = torch.arange(m.reg_max).float().to(device)  # / 120.0
+        self.use_dfl = use_dfl
+    def preprocess(self, targets, batch_size, scale_tensor):
+        """Pack flat target rows into a zero-padded ``(batch_size, max_per_image, 5)`` tensor.
+        Column 0 of ``targets`` is read as the image index; the remaining columns are copied into the
+        output, whose columns 1:5 are then multiplied by ``scale_tensor`` and passed through
+        ``xywh2xyxy``.
+        """
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            # Second dimension is the largest number of targets found for any single image
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+    def bbox_decode(self, anchor_points, pred_dist):
+        """Convert regression output to boxes through ``dist2bbox(..., xywh=False)``.
+        With ``self.use_dfl`` set, the channels are viewed as 4 groups of bins, soft-maxed over the
+        bins and reduced to one value per group by a dot product with ``self.proj``.
+        """
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+    def __call__(self, p, targets, img=None, epoch=0):
+        """Compute the box, cls and dfl losses accumulated over three prediction branches.
+        Args:
+            p: model output. When it is a tuple the three per-branch feature lists are read from
+                ``p[1][0]``, ``p[1][1]`` and ``p[1][2]``; otherwise from ``p[0]``, ``p[1]`` and ``p[2]``.
+            targets: ground-truth rows, handed to ``self.preprocess`` together with the batch size.
+            img: not used in this method.
+            epoch: not used in this method.
+        Returns:
+            ``(loss.sum() * batch_size, loss.detach())`` where ``loss`` holds (box, cls, dfl).
+        """
+        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
+        # Per-branch feature lists: nested under p[1] when p is a tuple, otherwise taken from p directly
+        feats = p[1][0] if isinstance(p, tuple) else p[0]
+        feats2 = p[1][1] if isinstance(p, tuple) else p[1]
+        feats3 = p[1][2] if isinstance(p, tuple) else p[2]
+        # For each branch: flatten every level to (batch, no, -1), concatenate along the last axis,
+        # split channels into reg_max * 4 distribution channels and nc class channels, then move
+        # the channel axis last
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        pred_distri2, pred_scores2 = torch.cat([xi.view(feats2[0].shape[0], self.no, -1) for xi in feats2], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores2 = pred_scores2.permute(0, 2, 1).contiguous()
+        pred_distri2 = pred_distri2.permute(0, 2, 1).contiguous()
+        pred_distri3, pred_scores3 = torch.cat([xi.view(feats3[0].shape[0], self.no, -1) for xi in feats3], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores3 = pred_scores3.permute(0, 2, 1).contiguous()
+        pred_distri3 = pred_distri3.permute(0, 2, 1).contiguous()
+        dtype = pred_scores.dtype
+        batch_size, grid_size = pred_scores.shape[:2]
+        # Image size is recovered from the first feature level's spatial shape times the first stride
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        # Anchors are built from the first branch's features only and shared by all branches
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+        # targets: scale tensor is ordered (w, h, w, h)
+        targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        # A ground-truth slot counts as valid when its box coordinates sum to more than zero
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+        pred_bboxes2 = self.bbox_decode(anchor_points, pred_distri2)  # xyxy, (b, h*w, 4)
+        pred_bboxes3 = self.bbox_decode(anchor_points, pred_distri3)  # xyxy, (b, h*w, 4)
+        # Each branch gets its own assignment, computed from that branch's detached predictions
+        target_labels, target_bboxes, target_scores, fg_mask = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_labels2, target_bboxes2, target_scores2, fg_mask2 = self.assigner2(
+            pred_scores2.detach().sigmoid(),
+            (pred_bboxes2.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_labels3, target_bboxes3, target_scores3, fg_mask3 = self.assigner3(
+            pred_scores3.detach().sigmoid(),
+            (pred_bboxes3.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        # Assigned boxes are divided by the stride so they match the scale of the decoded predictions;
+        # the score sums are floored at 1 so the divisions below cannot be by zero
+        target_bboxes /= stride_tensor
+        target_scores_sum = max(target_scores.sum(), 1)
+        target_bboxes2 /= stride_tensor
+        target_scores_sum2 = max(target_scores2.sum(), 1)
+        target_bboxes3 /= stride_tensor
+        target_scores_sum3 = max(target_scores3.sum(), 1)
+        # cls loss: branches 1 and 2 scaled by 0.25, branch 3 added unscaled
+        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
+        loss[1] = 0.25 * self.BCEcls(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
+        loss[1] += 0.25 * self.BCEcls(pred_scores2, target_scores2.to(dtype)).sum() / target_scores_sum2 # BCE
+        loss[1] += self.BCEcls(pred_scores3, target_scores3.to(dtype)).sum() / target_scores_sum3 # BCE
+        # bbox loss: only evaluated for a branch that has at least one foreground anchor
+        if fg_mask.sum():
+            loss[0], loss[2], iou = self.bbox_loss(pred_distri,
+                                                   pred_bboxes,
+                                                   anchor_points,
+                                                   target_bboxes,
+                                                   target_scores,
+                                                   target_scores_sum,
+                                                   fg_mask)
+            # branch 1 box/dfl terms scaled by 0.25, like its cls term
+            loss[0] *= 0.25
+            loss[2] *= 0.25
+        if fg_mask2.sum():
+            loss0_, loss2_, iou2 = self.bbox_loss2(pred_distri2,
+                                                   pred_bboxes2,
+                                                   anchor_points,
+                                                   target_bboxes2,
+                                                   target_scores2,
+                                                   target_scores_sum2,
+                                                   fg_mask2)
+            # branch 2 box/dfl terms also scaled by 0.25
+            loss[0] += 0.25 * loss0_
+            loss[2] += 0.25 * loss2_
+        if fg_mask3.sum():
+            loss0__, loss2__, iou3 = self.bbox_loss3(pred_distri3,
+                                                   pred_bboxes3,
+                                                   anchor_points,
+                                                   target_bboxes3,
+                                                   target_scores3,
+                                                   target_scores_sum3,
+                                                   fg_mask3)
+            # branch 3 box/dfl terms added unscaled
+            loss[0] += loss0__
+            loss[2] += loss2__
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 0.5  # cls gain
+        loss[2] *= 1.5  # dfl gain
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
--- a/utils/metrics.py
+++ b/utils/metrics.py
+import math
+import warnings
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from utils import TryExcept, threaded
+def fitness(x):
+    # Model fitness as a weighted combination of metrics
+    w = [0.0, 0.0, 0.1, 0.9]  # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
+    return (x[:, :4] * w).sum(1)
+def smooth(y, f=0.05):
+    # Box filter of fraction f
+    nf = round(len(y) * f * 2) // 2 + 1  # number of filter elements (must be odd)
+    p = np.ones(nf // 2)  # ones padding
+    yp = np.concatenate((p * y[0], y, p * y[-1]), 0)  # y padded
+    return np.convolve(yp, np.ones(nf) / nf, mode='valid')  # y-smoothed
+def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), eps=1e-16, prefix=""):
+    """ Compute the average precision, given the recall and precision curves.
+    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
+    # Arguments
+        tp:  True positives (nparray, nx1 or nx10).
+        conf:  Objectness value from 0-1 (nparray).
+        pred_cls:  Predicted object classes (nparray).
+        target_cls:  True object classes (nparray).
+        plot:  Plot precision-recall curve at mAP@0.5
+        save_dir:  Plot save directory
+    # Returns
+        The average precision as computed in py-faster-rcnn.
+    """
+    # Sort by objectness
+    i = np.argsort(-conf)
+    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
+    # Find unique classes
+    unique_classes, nt = np.unique(target_cls, return_counts=True)
+    nc = unique_classes.shape[0]  # number of classes, number of detections
+    # Create Precision-Recall curve and compute AP for each class
+    px, py = np.linspace(0, 1, 1000), []  # for plotting
+    ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
+    for ci, c in enumerate(unique_classes):
+        i = pred_cls == c
+        n_l = nt[ci]  # number of labels
+        n_p = i.sum()  # number of predictions
+        if n_p == 0 or n_l == 0:
+            continue
+        # Accumulate FPs and TPs
+        fpc = (1 - tp[i]).cumsum(0)
+        tpc = tp[i].cumsum(0)
+        # Recall
+        recall = tpc / (n_l + eps)  # recall curve
+        r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0)  # negative x, xp because xp decreases
+        # Precision
+        precision = tpc / (tpc + fpc)  # precision curve
+        p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1)  # p at pr_score
+        # AP from recall-precision curve
+        for j in range(tp.shape[1]):
+            ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
+            if plot and j == 0:
+                py.append(np.interp(px, mrec, mpre))  # precision at mAP@0.5
+    # Compute F1 (harmonic mean of precision and recall)
+    f1 = 2 * p * r / (p + r + eps)
+    names = [v for k, v in names.items() if k in unique_classes]  # list: only classes that have data
+    names = dict(enumerate(names))  # to dict
+    if plot:
+        plot_pr_curve(px, py, ap, Path(save_dir) / f'{prefix}PR_curve.png', names)
+        plot_mc_curve(px, f1, Path(save_dir) / f'{prefix}F1_curve.png', names, ylabel='F1')
+        plot_mc_curve(px, p, Path(save_dir) / f'{prefix}P_curve.png', names, ylabel='Precision')
+        plot_mc_curve(px, r, Path(save_dir) / f'{prefix}R_curve.png', names, ylabel='Recall')
+    i = smooth(f1.mean(0), 0.1).argmax()  # max F1 index
+    p, r, f1 = p[:, i], r[:, i], f1[:, i]
+    tp = (r * nt).round()  # true positives
+    fp = (tp / (p + eps) - tp).round()  # false positives
+    return tp, fp, p, r, f1, ap, unique_classes.astype(int)
+def compute_ap(recall, precision):
+    """ Compute the average precision, given the recall and precision curves
+    # Arguments
+        recall:    The recall curve (list)
+        precision: The precision curve (list)
+    # Returns
+        Average precision, precision curve, recall curve
+    """
+    # Append sentinel values to beginning and end
+    mrec = np.concatenate(([0.0], recall, [1.0]))
+    mpre = np.concatenate(([1.0], precision, [0.0]))
+    # Compute the precision envelope
+    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
+    # Integrate area under curve
+    method = 'interp'  # methods: 'continuous', 'interp'
+    if method == 'interp':
+        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
+        ap = np.trapz(np.interp(x, mrec, mpre), x)  # integrate
+    else:  # 'continuous'
+        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x axis (recall) changes
+        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve
+    return ap, mpre, mrec
+class ConfusionMatrix:
+    # Updated version of https://github.com/kaanakan/object_detection_confusion_matrix
+    def __init__(self, nc, conf=0.25, iou_thres=0.45):
+        self.matrix = np.zeros((nc + 1, nc + 1))
+        self.nc = nc  # number of classes
+        self.conf = conf
+        self.iou_thres = iou_thres
+    def process_batch(self, detections, labels):
+        """
+        Return intersection-over-union (Jaccard index) of boxes.
+        Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+        Arguments:
+            detections (Array[N, 6]), x1, y1, x2, y2, conf, class
+            labels (Array[M, 5]), class, x1, y1, x2, y2
+        Returns:
+            None, updates confusion matrix accordingly
+        """
+        if detections is None:
+            gt_classes = labels.int()
+            for gc in gt_classes:
+                self.matrix[self.nc, gc] += 1  # background FN
+            return
+        detections = detections[detections[:, 4] > self.conf]
+        gt_classes = labels[:, 0].int()
+        detection_classes = detections[:, 5].int()
+        iou = box_iou(labels[:, 1:], detections[:, :4])
+        x = torch.where(iou > self.iou_thres)
+        if x[0].shape[0]:
+            matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
+            if x[0].shape[0] > 1:
+                matches = matches[matches[:, 2].argsort()[::-1]]
+                matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
+                matches = matches[matches[:, 2].argsort()[::-1]]
+                matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
+        else:
+            matches = np.zeros((0, 3))
+        n = matches.shape[0] > 0
+        m0, m1, _ = matches.transpose().astype(int)
+        for i, gc in enumerate(gt_classes):
+            j = m0 == i
+            if n and sum(j) == 1:
+                self.matrix[detection_classes[m1[j]], gc] += 1  # correct
+            else:
+                self.matrix[self.nc, gc] += 1  # true background
+        if n:
+            for i, dc in enumerate(detection_classes):
+                if not any(m1 == i):
+                    self.matrix[dc, self.nc] += 1  # predicted background
+    def matrix(self):
+        return self.matrix
+    def tp_fp(self):
+        tp = self.matrix.diagonal()  # true positives
+        fp = self.matrix.sum(1) - tp  # false positives
+        # fn = self.matrix.sum(0) - tp  # false negatives (missed detections)
+        return tp[:-1], fp[:-1]  # remove background class
+    @TryExcept('WARNING ⚠️ ConfusionMatrix plot failure')
+    def plot(self, normalize=True, save_dir='', names=()):
+        import seaborn as sn
+        array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1E-9) if normalize else 1)  # normalize columns
+        array[array < 0.005] = np.nan  # don't annotate (would appear as 0.00)
+        fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True)
+        nc, nn = self.nc, len(names)  # number of classes, names
+        sn.set(font_scale=1.0 if nc < 50 else 0.8)  # for label size
+        labels = (0 < nn < 99) and (nn == nc)  # apply names to ticklabels
+        ticklabels = (names + ['background']) if labels else "auto"
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore')  # suppress empty matrix RuntimeWarning: All-NaN slice encountered
+            sn.heatmap(array,
+                       ax=ax,
+                       annot=nc < 30,
+                       annot_kws={
+                           "size": 8},
+                       cmap='Blues',
+                       fmt='.2f',
+                       square=True,
+                       vmin=0.0,
+                       xticklabels=ticklabels,
+                       yticklabels=ticklabels).set_facecolor((1, 1, 1))
+        ax.set_ylabel('True')
+        ax.set_ylabel('Predicted')
+        ax.set_title('Confusion Matrix')
+        fig.savefig(Path(save_dir) / 'confusion_matrix.png', dpi=250)
+        plt.close(fig)
+    def print(self):
+        # Write rows 0..nc of the matrix to stdout, one row per line with cells separated by single spaces
+        for row_idx in range(self.nc + 1):
+            cells = [str(value) for value in self.matrix[row_idx]]
+            print(' '.join(cells))
+class WIoU_Scale:
+    """Scaling factor for a WIoU-style loss, driven by a class-wide running mean of IoU.
+
+    Class attributes:
+        monotonous: None  -> origin v1 (``_scaled_loss`` returns 1)
+                    True  -> monotonic FM v2
+                    False -> non-monotonic FM v3
+        iou_mean:   running mean of IoU, shared by all instances and updated on every construction
+        _momentum:  momentum of the running mean
+        _is_train:  the running mean is only updated while this is True
+    """
+    iou_mean = 1.
+    monotonous = False
+    _momentum = 1 - 0.5 ** (1 / 7000)
+    _is_train = True
+    def __init__(self, iou):
+        # Keep the IoU tensor and fold its mean into the shared running mean
+        self.iou = iou
+        self._update(self)
+    @classmethod
+    def _update(cls, self):
+        # Exponential moving average of the batch-mean IoU, stored on the class
+        if not cls._is_train:
+            return
+        batch_mean = self.iou.detach().mean().item()
+        cls.iou_mean = (1 - cls._momentum) * cls.iou_mean + cls._momentum * batch_mean
+    @classmethod
+    def _scaled_loss(cls, self, gamma=1.9, delta=3):
+        # Return the loss scaling factor selected by `monotonous`; any non-bool value yields 1
+        if not isinstance(self.monotonous, bool):
+            return 1
+        if self.monotonous:
+            return (self.iou.detach() / self.iou_mean).sqrt()
+        beta = self.iou.detach() / self.iou_mean
+        alpha = delta * torch.pow(gamma, beta - delta)
+        return beta / alpha
+def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, MDPIoU=False, feat_h=640, feat_w=640, eps=1e-7):
+    # Returns Intersection over Union (IoU) of box1(1,4) to box2(n,4), or one of its penalised variants
+    #   xywh:   True if boxes are (x_center, y_center, w, h); False if they are (x1, y1, x2, y2)
+    #   GIoU / DIoU / CIoU: return that variant instead of plain IoU. These flags are checked before MDPIoU;
+    #           CIoU takes precedence over DIoU, and DIoU over GIoU
+    #   MDPIoU: return IoU minus the squared top-left and bottom-right corner distances, each divided by
+    #           feat_h ** 2 + feat_w ** 2 (only reached when GIoU, DIoU and CIoU are all False)
+    #   eps:    small constant added to denominators
+    # NOTE(review): eps is added to h1/h2 only in the xyxy branch; in the xywh branch the CIoU term w / h
+    # uses the heights as given, so zero-height boxes would divide by zero — confirm callers guarantee h > 0.
+    # Get the coordinates of bounding boxes
+    if xywh:  # transform from xywh to xyxy
+        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
+        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
+        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
+        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
+    else:  # x1, y1, x2, y2 = box1
+        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
+        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
+        w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+        w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+    # Intersection area (negative overlaps are clamped to 0, so disjoint boxes give 0)
+    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
+            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
+    # Union Area
+    union = w1 * h1 + w2 * h2 - inter + eps
+    # IoU
+    iou = inter / union
+    if CIoU or DIoU or GIoU:
+        cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)  # convex (smallest enclosing box) width
+        ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
+        if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
+            c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
+            rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center dist ** 2
+            if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
+                v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
+                # alpha is computed under no_grad, so no gradient flows through the weighting factor itself
+                with torch.no_grad():
+                    alpha = v / (v - iou + (1 + eps))
+                return iou - (rho2 / c2 + v * alpha)  # CIoU
+            return iou - rho2 / c2  # DIoU
+        c_area = cw * ch + eps  # convex area
+        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf
+    elif MDPIoU:
+        d1 = (b2_x1 - b1_x1) ** 2 + (b2_y1 - b1_y1) ** 2  # squared distance between top-left corners
+        d2 = (b2_x2 - b1_x2) ** 2 + (b2_y2 - b1_y2) ** 2  # squared distance between bottom-right corners
+        mpdiou_hw_pow = feat_h ** 2 + feat_w ** 2
+        return iou - d1 / mpdiou_hw_pow - d2 / mpdiou_hw_pow  # MPDIoU
+    return iou  # IoU
+def box_iou(box1, box2, eps=1e-7):
+    # Pairwise intersection-over-union (Jaccard index) between two sets of (x1, y1, x2, y2) boxes
+    # Adapted from https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
+    #   box1: Tensor[N, 4]
+    #   box2: Tensor[M, 4]
+    #   eps:  small constant added to the denominator
+    # Returns Tensor[N, M] holding the IoU of every box1 / box2 pair
+    top_left_1, bottom_right_1 = box1.unsqueeze(1).chunk(2, 2)  # [N,1,2] each
+    top_left_2, bottom_right_2 = box2.unsqueeze(0).chunk(2, 2)  # [1,M,2] each
+    overlap_wh = (torch.min(bottom_right_1, bottom_right_2) - torch.max(top_left_1, top_left_2)).clamp(0)
+    inter = overlap_wh.prod(2)  # [N,M]
+    area1 = (bottom_right_1 - top_left_1).prod(2)
+    area2 = (bottom_right_2 - top_left_2).prod(2)
+    return inter / (area1 + area2 - inter + eps)
+def bbox_ioa(box1, box2, eps=1e-7):
+    """Intersection of each box1 with each box2, divided by the box2 area. Boxes are x1y1x2y2
+    box1:       np.array of shape(nx4)
+    box2:       np.array of shape(mx4)
+    returns:    np.array of shape(nxm)
+    """
+    # box1 coordinates become column vectors so they broadcast against the box2 row vectors
+    ax1, ay1, ax2, ay2 = (coord[:, None] for coord in box1.T)
+    bx1, by1, bx2, by2 = box2.T
+    overlap_w = (np.minimum(ax2, bx2) - np.maximum(ax1, bx1)).clip(0)
+    overlap_h = (np.minimum(ay2, by2) - np.maximum(ay1, by1)).clip(0)
+    inter_area = overlap_w * overlap_h
+    box2_area = (bx2 - bx1) * (by2 - by1) + eps  # eps keeps the division finite for empty boxes
+    return inter_area / box2_area
+def wh_iou(wh1, wh2, eps=1e-7):
+    # IoU of width/height pairs, compared as if the boxes shared a corner. wh1 is nx2, wh2 is mx2; returns nxm
+    first = wh1[:, None]  # [N,1,2]
+    second = wh2[None]  # [1,M,2]
+    inter = torch.min(first, second).prod(2)  # [N,M]
+    union = first.prod(2) + second.prod(2) - inter + eps  # area1 + area2 - inter
+    return inter / union
+# Plots ----------------------------------------------------------------------------------------------------------------
+@threaded
+def plot_pr_curve(px, py, ap, save_dir=Path('pr_curve.png'), names=()):
+    # Plot precision (y) against recall (x) for every class plus the class mean, then save to save_dir
+    fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+    curves = np.stack(py, axis=1)  # one column per entry of py
+    per_class_legend = 0 < len(names) < 21  # only label individual classes when there are fewer than 21
+    if not per_class_legend:
+        ax.plot(px, curves, linewidth=1, color='grey')  # plot(recall, precision)
+    else:
+        for cls_idx, precision in enumerate(curves.T):
+            ax.plot(px, precision, linewidth=1, label=f'{names[cls_idx]} {ap[cls_idx, 0]:.3f}')
+    mean_curve = curves.mean(1)
+    mean_label = 'all classes %.3f mAP@0.5' % ap[:, 0].mean()
+    ax.plot(px, mean_curve, linewidth=3, color='blue', label=mean_label)
+    ax.set_xlabel('Recall')
+    ax.set_ylabel('Precision')
+    ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
+    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+    ax.set_title('Precision-Recall Curve')
+    fig.savefig(save_dir, dpi=250)
+    plt.close(fig)
+@threaded
+def plot_mc_curve(px, py, save_dir=Path('mc_curve.png'), names=(), xlabel='Confidence', ylabel='Metric'):
+    # Plot a metric (y) against confidence (x) for every class plus the smoothed class mean, then save to save_dir
+    fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
+    per_class_legend = 0 < len(names) < 21  # only label individual classes when there are fewer than 21
+    if not per_class_legend:
+        ax.plot(px, py.T, linewidth=1, color='grey')  # plot(confidence, metric)
+    else:
+        for cls_idx, curve in enumerate(py):
+            ax.plot(px, curve, linewidth=1, label=f'{names[cls_idx]}')
+    mean_curve = smooth(py.mean(0), 0.05)
+    # The legend entry reports the peak of the smoothed mean and the x value where it occurs
+    summary = f'all classes {mean_curve.max():.2f} at {px[mean_curve.argmax()]:.3f}'
+    ax.plot(px, mean_curve, linewidth=3, color='blue', label=summary)
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(ylabel)
+    ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
+    ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
+    ax.set_title(f'{ylabel}-Confidence Curve')
+    fig.savefig(save_dir, dpi=250)
+    plt.close(fig)
--- a/utils/panoptic/__init__.py
+++ b/utils/panoptic/__init__.py
+# init
\ No newline at end of file
--- a/utils/panoptic/augmentations.py
+++ b/utils/panoptic/augmentations.py
+import math
+import random
+import cv2
+import numpy as np
+from ..augmentations import box_candidates
+from ..general import resample_segments, segment2box
+from ..metrics import bbox_ioa
+def mixup(im, labels, segments, seg_cls, semantic_masks, im2, labels2, segments2, seg_cls2, semantic_masks2):
+    # MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf: blend two images and join their annotations
+    ratio = np.random.beta(32.0, 32.0)  # blend weight of the first image, alpha=beta=32.0
+    blended = (im * ratio + im2 * (1 - ratio)).astype(np.uint8)
+    pairs = ((labels, labels2), (segments, segments2), (seg_cls, seg_cls2), (semantic_masks, semantic_masks2))
+    merged = [np.concatenate(pair, 0) for pair in pairs]  # first sample's entries, then the second's
+    return (blended, *merged)
+def random_perspective(im,
+                       targets=(),
+                       segments=(),
+                       semantic_masks = (),
+                       degrees=10,
+                       translate=.1,
+                       scale=.1,
+                       shear=10,
+                       perspective=0.0,
+                       border=(0, 0)):
+    # Apply one random perspective/affine warp to the image, its boxes, its instance polygons and its
+    # semantic polygons. Returns (im, targets, new_segments, new_semantic_masks).
+    # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
+    # targets = [cls, xyxy]
+    # NOTE(review): when targets is empty the `if n:` block below is skipped, so semantic_masks are not warped
+    # and both returned polygon collections are empty lists even though the image itself is warped —
+    # confirm this is intended for images that have semantic masks but no targets.
+    height = im.shape[0] + border[0] * 2  # shape(h,w,c)
+    width = im.shape[1] + border[1] * 2
+    # Center
+    C = np.eye(3)
+    C[0, 2] = -im.shape[1] / 2  # x translation (pixels)
+    C[1, 2] = -im.shape[0] / 2  # y translation (pixels)
+    # Perspective
+    P = np.eye(3)
+    P[2, 0] = random.uniform(-perspective, perspective)  # x perspective (about y)
+    P[2, 1] = random.uniform(-perspective, perspective)  # y perspective (about x)
+    # Rotation and Scale
+    R = np.eye(3)
+    a = random.uniform(-degrees, degrees)
+    # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
+    s = random.uniform(1 - scale, 1 + scale)
+    # s = 2 ** random.uniform(-scale, scale)
+    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+    # Shear
+    S = np.eye(3)
+    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
+    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)
+    # Translation
+    T = np.eye(3)
+    T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width)  # x translation (pixels)
+    T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height)  # y translation (pixels)
+    # Combined rotation matrix
+    M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
+    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+        if perspective:
+            im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114))
+        else:  # affine
+            im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114))
+    # Visualize
+    # import matplotlib.pyplot as plt
+    # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel()
+    # ax[0].imshow(im[:, :, ::-1])  # base
+    # ax[1].imshow(im2[:, :, ::-1])  # warped
+    # Transform label coordinates
+    n = len(targets)
+    new_segments = []
+    new_semantic_masks = []
+    if n:
+        new = np.zeros((n, 4))  # one box per target, rebuilt from the warped instance polygons
+        segments = resample_segments(segments)  # upsample
+        for i, segment in enumerate(segments):
+            xy = np.ones((len(segment), 3))  # homogeneous coordinates (x, y, 1)
+            xy[:, :2] = segment
+            xy = xy @ M.T  # transform
+            xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2])  # perspective rescale or affine
+            # clip
+            new[i] = segment2box(xy, width, height)
+            new_segments.append(xy)
+        # Semantic polygons get the same warp M but do not contribute to the boxes
+        semantic_masks = resample_segments(semantic_masks)
+        for i, semantic_mask in enumerate(semantic_masks):
+            #if i < n:
+            #    xy = np.ones((len(segments[i]), 3))
+            #    xy[:, :2] = segments[i]
+            #    xy = xy @ M.T  # transform
+            #    xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2])  # perspective rescale or affine
+            #    new[i] = segment2box(xy, width, height)
+            #    new_segments.append(xy)
+            xy_s = np.ones((len(semantic_mask), 3))
+            xy_s[:, :2] = semantic_mask
+            xy_s = xy_s @ M.T  # transform
+            xy_s = (xy_s[:, :2] / xy_s[:, 2:3] if perspective else xy_s[:, :2])  # perspective rescale or affine
+            new_semantic_masks.append(xy_s)
+        # filter candidates (only targets and instance polygons are filtered; semantic polygons are all kept)
+        i = box_candidates(box1=targets[:, 1:5].T * s, box2=new.T, area_thr=0.01)
+        targets = targets[i]
+        targets[:, 1:5] = new[i]
+        new_segments = np.array(new_segments)[i]
+        new_semantic_masks = np.array(new_semantic_masks)
+    return im, targets, new_segments, new_semantic_masks
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+    # Resize an image to fit new_shape and pad the remainder with `color`
+    # Returns (padded image, (width ratio, height ratio), (dw, dh)) where dw/dh are the per-side paddings
+    shape = im.shape[:2]  # current shape [height, width]
+    src_h, src_w = shape[0], shape[1]
+    if isinstance(new_shape, int):
+        new_shape = (new_shape, new_shape)
+    dst_h, dst_w = new_shape[0], new_shape[1]
+    # Scale ratio (new / old), limited by the tighter of the two dimensions
+    gain = min(dst_h / src_h, dst_w / src_w)
+    if not scaleup:  # only scale down, do not scale up (for better val mAP)
+        gain = min(gain, 1.0)
+    ratio = (gain, gain)  # width, height ratios
+    new_unpad = (int(round(src_w * gain)), int(round(src_h * gain)))  # (width, height) before padding
+    dw = dst_w - new_unpad[0]  # total width padding
+    dh = dst_h - new_unpad[1]  # total height padding
+    if auto:  # minimum rectangle: pad only up to the next multiple of stride
+        dw, dh = np.mod(dw, stride), np.mod(dh, stride)
+    elif scaleFill:  # stretch to new_shape without padding
+        dw, dh = 0.0, 0.0
+        new_unpad = (dst_w, dst_h)
+        ratio = (dst_w / src_w, dst_h / src_h)  # width, height ratios
+    dw, dh = dw / 2, dh / 2  # split the padding between the two sides
+    if (src_w, src_h) != new_unpad:  # resize
+        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+    # The -0.1 / +0.1 offsets send an odd pixel of padding to the bottom/right side
+    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
+    return im, ratio, (dw, dh)
+def copy_paste(im, labels, segments, seg_cls, semantic_masks, p=0.5):
+    # Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy)
+    # A random share (p) of the instances whose left-right mirrored box overlaps every existing box by
+    # less than 30% of that box's area is pasted, mirrored, into the image.
+    # Side effects: `im` is edited in place, and `segments`, `seg_cls` and `semantic_masks` are extended in
+    # place via append (so they must be lists); `labels` is returned as a new, longer array.
+    n = len(segments)
+    if p and n:
+        h, w, _ = im.shape  # height, width, channels
+        im_new = np.zeros(im.shape, np.uint8)  # mask canvas marking the pixels of the selected instances
+        # calculate ioa first then select indexes randomly
+        boxes = np.stack([w - labels[:, 3], labels[:, 2], w - labels[:, 1], labels[:, 4]], axis=-1)  # (n, 4)
+        ioa = bbox_ioa(boxes, labels[:, 1:5])  # intersection over area
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        for j in random.sample(list(indexes), k=round(p * n)):
+            l, box, s = labels[j], boxes[j], segments[j]
+            labels = np.concatenate((labels, [[l[0], *box]]), 0)  # mirrored box keeps the source class
+            segments.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1))  # mirror polygon x coordinates
+            # The pasted instance is also recorded as a semantic entry, using the instance class id
+            # NOTE(review): confirm instance class ids are valid indices for the semantic class list
+            seg_cls.append(l[0].astype(int))
+            semantic_masks.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1))
+            cv2.drawContours(im_new, [segments[j].astype(np.int32)], -1, (1, 1, 1), cv2.FILLED)
+        result = cv2.flip(im, 1)  # augment segments (flip left-right)
+        i = cv2.flip(im_new, 1).astype(bool)  # mirrored mask selects where the mirrored pixels land
+        im[i] = result[i]  # cv2.imwrite('debug.jpg', im)  # debug
+    return im, labels, segments, seg_cls, semantic_masks
\ No newline at end of file
--- a/utils/panoptic/dataloaders.py
+++ b/utils/panoptic/dataloaders.py
+import os
+import random
+import pickle
+from pathlib import Path
+from itertools import repeat
+from multiprocessing.pool import Pool, ThreadPool
+import cv2
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, distributed
+from tqdm import tqdm
+from ..augmentations import augment_hsv
+from ..dataloaders import InfiniteDataLoader, LoadImagesAndLabels, seed_worker, get_hash, verify_image_label, HELP_URL, TQDM_BAR_FORMAT, LOCAL_RANK
+from ..general import NUM_THREADS, LOGGER, xyn2xy, xywhn2xyxy, xyxy2xywhn
+from ..torch_utils import torch_distributed_zero_first
+from ..coco_utils import annToMask, getCocoIds
+from .augmentations import mixup, random_perspective, copy_paste, letterbox
+# Process rank read from the RANK environment variable; -1 when the variable is unset
+RANK = int(os.getenv('RANK', -1))
+def create_dataloader(path,
+                      imgsz,
+                      batch_size,
+                      stride,
+                      single_cls=False,
+                      hyp=None,
+                      augment=False,
+                      cache=False,
+                      pad=0.0,
+                      rect=False,
+                      rank=-1,
+                      workers=8,
+                      image_weights=False,
+                      close_mosaic=False,
+                      quad=False,
+                      prefix='',
+                      shuffle=False,
+                      mask_downsample_ratio=1,
+                      overlap_mask=False):
+    # Build a LoadImagesAndLabelsAndMasks dataset for `path` and wrap it in a loader; returns (loader, dataset)
+    if rect and shuffle:
+        LOGGER.warning('WARNING ⚠️ --rect is incompatible with DataLoader shuffle, setting shuffle=False')
+        shuffle = False
+    with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
+        dataset_options = dict(
+            augment=augment,  # augmentation
+            hyp=hyp,  # hyperparameters
+            rect=rect,  # rectangular batches
+            cache_images=cache,
+            single_cls=single_cls,
+            stride=int(stride),
+            pad=pad,
+            image_weights=image_weights,
+            prefix=prefix,
+            downsample_ratio=mask_downsample_ratio,
+            overlap=overlap_mask)
+        dataset = LoadImagesAndLabelsAndMasks(path, imgsz, batch_size, **dataset_options)
+    batch_size = min(batch_size, len(dataset))
+    cuda_devices = torch.cuda.device_count()  # number of CUDA devices
+    cpus_per_device = os.cpu_count() // max(cuda_devices, 1)
+    # Worker count is capped by CPUs per device, by the batch size (0 workers for batch size <= 1) and by `workers`
+    num_workers = min(cpus_per_device, batch_size if batch_size > 1 else 0, workers)
+    if rank == -1:
+        sampler = None
+    else:
+        sampler = distributed.DistributedSampler(dataset, shuffle=shuffle)
+    # only DataLoader allows for attribute updates, so it is used for image_weights or close_mosaic
+    loader_cls = DataLoader if image_weights or close_mosaic else InfiniteDataLoader
+    if quad:
+        collate = LoadImagesAndLabelsAndMasks.collate_fn4
+    else:
+        collate = LoadImagesAndLabelsAndMasks.collate_fn
+    rng = torch.Generator()
+    rng.manual_seed(6148914691236517205 + RANK)  # fixed base seed, offset by RANK
+    loader = loader_cls(
+        dataset,
+        batch_size=batch_size,
+        shuffle=shuffle and sampler is None,
+        num_workers=num_workers,
+        sampler=sampler,
+        pin_memory=True,
+        collate_fn=collate,
+        worker_init_fn=seed_worker,
+        generator=rng,
+    )
+    return loader, dataset
+def img2stuff_paths(img_paths):
+    # Map image paths to stuff-label paths: the last '/images/' directory becomes '/stuff/' and the
+    # file extension becomes '.txt'. Paths without an '/images/' component only get the new extension.
+    images_dir = f'{os.sep}images{os.sep}'
+    stuff_dir = f'{os.sep}stuff{os.sep}'
+    label_paths = []
+    for img_path in img_paths:
+        head, found, tail = img_path.rpartition(images_dir)
+        swapped = head + stuff_dir + tail if found else img_path
+        label_paths.append(swapped.rsplit('.', 1)[0] + '.txt')
+    return label_paths
+class LoadImagesAndLabelsAndMasks(LoadImagesAndLabels):  # for training/testing
+    def __init__(
+        self,
+        path,
+        img_size=640,
+        batch_size=16,
+        augment=False,
+        hyp=None,
+        rect=False,
+        image_weights=False,
+        cache_images=False,
+        single_cls=False,
+        stride=32,
+        pad=0,
+        min_items=0,
+        prefix="",
+        downsample_ratio=1,
+        overlap=False,
+    ):
+        # Initialise the base dataset, then load (or build) the cache of per-image "stuff" labels and
+        # store their classes in self.seg_cls and their polygons in self.semantic_masks.
+        #   downsample_ratio: stored on the instance; used when rasterising masks in __getitem__
+        #   overlap:          stored on the instance; selects the overlap mask layout in __getitem__
+        # All other arguments are forwarded positionally to LoadImagesAndLabels.__init__.
+        super().__init__(
+            path,
+            img_size,
+            batch_size,
+            augment,
+            hyp,
+            rect,
+            image_weights,
+            cache_images,
+            single_cls,
+            stride,
+            pad,
+            min_items,
+            prefix)        
+        self.downsample_ratio = downsample_ratio
+        self.overlap = overlap
+        # semantic segmentation
+        self.coco_ids = getCocoIds()
+        # Check cache
+        self.seg_files = img2stuff_paths(self.im_files)  # labels
+        p = Path(path)
+        # Cache file sits next to the dataset file, or next to the first stuff-label directory, with a '_stuff' suffix
+        cache_path = (p.with_suffix('') if p.is_file() else Path(self.seg_files[0]).parent)
+        cache_path = Path(str(cache_path) + '_stuff').with_suffix('.cache')
+        try:
+            # NOTE(review): allow_pickle=True unpickles the cache file — only load caches from trusted locations
+            cache, exists = np.load(cache_path, allow_pickle = True).item(), True  # load dict
+            # NOTE(review): the version/hash checks below are disabled, so a stale cache is accepted as-is
+            #assert cache['version'] == self.cache_version  # matches current version
+            #assert cache['hash'] == get_hash(self.seg_files + self.im_files)  # identical hash
+        except Exception:
+            # Any load failure (including a missing file) falls back to rebuilding the cache
+            cache, exists = self.cache_seg_labels(cache_path, prefix), False  # run cache ops
+        # Display cache
+        nf, nm, ne, nc, n = cache.pop('results')  # found, missing, empty, corrupt, total
+        if exists and LOCAL_RANK in {-1, 0}:
+            d = f"Scanning '{cache_path}' images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupt"
+            tqdm(None, desc = (prefix + d), total = n, initial = n, bar_format = TQDM_BAR_FORMAT)  # display cache results
+            if cache['msgs']:
+                LOGGER.info('\n'.join(cache['msgs']))  # display warnings
+        assert (0 < nf) or (not augment), f'{prefix}No labels found in {cache_path}, can not start training. {HELP_URL}'
+        # Read cache
+        [cache.pop(k) for k in ('hash', 'version', 'msgs')]  # remove items
+        # Remaining values are (labels, shape, segments) triples; zip(*) yields one tuple per field
+        seg_labels, _, self.semantic_masks = zip(*cache.values())
+        nl = len(np.concatenate(seg_labels, 0))  # number of labels
+        assert nl > 0 or not augment, f'{prefix}All labels empty in {cache_path}, can not start training. {HELP_URL}'
+        # Update labels
+        self.seg_cls = []  # per image: list of integer class ids taken from column 0 of the stuff labels
+        include_class = []  # filter labels to include only these classes (optional)
+        include_class_array = np.array(include_class).reshape(1, -1)
+        for i, (label, semantic_masks) in enumerate(zip(seg_labels, self.semantic_masks)):
+            self.seg_cls.append((label[:, 0].astype(int)).tolist())
+            # NOTE(review): self.semantic_masks is a tuple (from zip), so the item assignment below would raise
+            # TypeError; it is currently unreachable because include_class is hard-coded empty.
+            if include_class:
+                j = (label[:, 0:1] == include_class_array).any(1)
+                if semantic_masks:
+                    self.semantic_masks[i] = semantic_masks[j]
+            # NOTE(review): self.seg_cls is not reset here, and [:, 0] indexing presumably fails if each entry
+            # is a list of polygons rather than one array — confirm single_cls works with stuff labels.
+            if single_cls:  # single-class training, merge all classes into 0
+                if semantic_masks:
+                    self.semantic_masks[i][:, 0] = 0
+    def __getitem__(self, index):
+        # Load one sample (mosaic or single letterboxed image), apply the configured augmentations and
+        # rasterise its polygons.
+        # Returns (image tensor CHW in RGB order, labels_out of shape (nl, 6) with columns 1: filled from labels,
+        #          image path, shapes (None for mosaic samples), instance masks, per-class semantic masks).
+        index = self.indices[index]  # linear, shuffled, or image_weights
+        hyp = self.hyp
+        mosaic = self.mosaic and random.random() < hyp['mosaic']
+        masks = []
+        if mosaic:
+            # Load mosaic
+            img, labels, segments, seg_cls, semantic_masks = self.load_mosaic(index)
+            shapes = None
+            # MixUp augmentation
+            if random.random() < hyp["mixup"]:
+                img, labels, segments, seg_cls, semantic_masks = mixup(img, labels, segments, seg_cls, semantic_masks,
+                                                                       *self.load_mosaic(random.randint(0, self.n - 1)))
+        else:
+            # Load image
+            img, (h0, w0), (h, w) = self.load_image(index)
+            # Letterbox
+            shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  # final letterboxed shape
+            img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment)
+            shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling
+            labels = self.labels[index].copy()
+            # [array, array, ....], array.shape=(num_points, 2), xyxyxyxy
+            segments = self.segments[index].copy()
+            if len(segments):
+                # normalized polygon points to pixel coordinates of the letterboxed image
+                for i_s in range(len(segments)):
+                    segments[i_s] = xyn2xy(
+                        segments[i_s],
+                        ratio[0] * w,
+                        ratio[1] * h,
+                        padw=pad[0],
+                        padh=pad[1],
+                    )
+            seg_cls = self.seg_cls[index].copy()
+            semantic_masks = self.semantic_masks[index].copy()
+            #semantic_masks = [xyn2xy(x, ratio[0] * w, ratio[1] * h, padw = pad[0], padh = pad[1]) for x in semantic_masks]
+            if len(semantic_masks):
+                # same conversion for the semantic polygons
+                for ss in range(len(semantic_masks)):
+                    semantic_masks[ss] = xyn2xy(
+                        semantic_masks[ss],
+                        ratio[0] * w,
+                        ratio[1] * h,
+                        padw = pad[0],
+                        padh = pad[1],
+                    )
+            if labels.size:  # normalized xywh to pixel xyxy format
+                labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])
+            if self.augment:
+                img, labels, segments, semantic_masks = random_perspective(
+                                                           img,
+                                                           labels,
+                                                           segments=segments,
+                                                           semantic_masks = semantic_masks,
+                                                           degrees=hyp["degrees"],
+                                                           translate=hyp["translate"],
+                                                           scale=hyp["scale"],
+                                                           shear=hyp["shear"],
+                                                           perspective=hyp["perspective"])
+        nl = len(labels)  # number of labels
+        if nl:
+            # pixel xyxy back to normalized xywh relative to the final image size
+            labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1e-3)
+            if self.overlap:
+                masks, sorted_idx = polygons2masks_overlap(img.shape[:2],
+                                                           segments,
+                                                           downsample_ratio=self.downsample_ratio)
+                masks = masks[None]  # (640, 640) -> (1, 640, 640)
+                labels = labels[sorted_idx]  # keep labels in the order returned alongside the overlap mask
+            else:
+                masks = polygons2masks(img.shape[:2], segments, color=1, downsample_ratio=self.downsample_ratio)
+        # With no instance masks, fall back to an all-zero tensor of the downsampled image size
+        masks = (torch.from_numpy(masks) if len(masks) else torch.zeros(1 if self.overlap else nl, img.shape[0] //
+                                                                        self.downsample_ratio, img.shape[1] //
+                                                                        self.downsample_ratio))
+        semantic_masks = polygons2masks(img.shape[:2], semantic_masks, color = 1, downsample_ratio=self.downsample_ratio)
+        #semantic_masks = polygons2masks(img.shape[:2], semantic_masks, color = 1, downsample_ratio=1)
+        semantic_masks = torch.from_numpy(semantic_masks)
+        # TODO: albumentations support
+        if self.augment:
+            # Albumentations
+            # there are some augmentation that won't change boxes and masks,
+            # so just be it for now.
+            img, labels = self.albumentations(img, labels)
+            nl = len(labels)  # update after albumentations
+            ns = len(semantic_masks)
+            # HSV color-space
+            augment_hsv(img, hgain=hyp["hsv_h"], sgain=hyp["hsv_s"], vgain=hyp["hsv_v"])
+            # Flip up-down (image, normalized y center of the boxes, and both mask tensors along dim 1)
+            if random.random() < hyp["flipud"]:
+                img = np.flipud(img)
+                if nl:
+                    labels[:, 2] = 1 - labels[:, 2]
+                    masks = torch.flip(masks, dims=[1])
+                if ns:
+                    semantic_masks = torch.flip(semantic_masks, dims = [1])
+            # Flip left-right (image, normalized x center of the boxes, and both mask tensors along dim 2)
+            if random.random() < hyp["fliplr"]:
+                img = np.fliplr(img)
+                if nl:
+                    labels[:, 1] = 1 - labels[:, 1]
+                    masks = torch.flip(masks, dims=[2])
+                if ns:
+                    semantic_masks = torch.flip(semantic_masks, dims = [2])
+            # Cutouts  # labels = cutout(img, labels, p=0.5)
+        labels_out = torch.zeros((nl, 6))  # column 0 is left at zero here
+        if nl:
+            labels_out[:, 1:] = torch.from_numpy(labels)
+        # Combine semantic masks: one channel per entry of self.coco_ids, OR-ing together masks of the same class
+        semantic_seg_masks = torch.zeros((len(self.coco_ids), img.shape[0] // self.downsample_ratio, 
+                                          img.shape[1] // self.downsample_ratio), dtype = torch.uint8)
+        #semantic_seg_masks = torch.zeros((len(self.coco_ids), img.shape[0], img.shape[1]), dtype = torch.uint8)
+        for cls_id, semantic_mask in zip(seg_cls, semantic_masks):
+            semantic_seg_masks[cls_id] = (semantic_seg_masks[cls_id].logical_or(semantic_mask)).int()
+        # Convert
+        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+        img = np.ascontiguousarray(img)
+        return (torch.from_numpy(img), labels_out, self.im_files[index], shapes, masks, semantic_seg_masks)
+    def load_mosaic(self, index):
+        # YOLO 4-mosaic loader. Loads 1 image + 3 random images into a 4-image mosaic
+        # Returns (img4, labels4, segments4, seg_cls, semantic_masks4) after copy-paste and random-perspective
+        # augmentation; labels4 holds pixel xyxy boxes, segments4 / semantic_masks4 hold pixel polygons.
+        labels4, segments4, seg_cls, semantic_masks4 = [], [], [], []
+        s = self.img_size
+        yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border)  # mosaic center x, y
+        indices = [index] + random.choices(self.indices, k=3)  # 3 additional image indices
+        for i, index in enumerate(indices):
+            # Load image
+            img, _, (h, w) = self.load_image(index)
+            # place img in img4
+            if i == 0:  # top left
+                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
+                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
+            elif i == 1:  # top right
+                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
+                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+            elif i == 2:  # bottom left
+                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+            elif i == 3:  # bottom right
+                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
+                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
+            # Offset of this tile's origin on the mosaic canvas, applied to all of its annotations
+            padw = x1a - x1b
+            padh = y1a - y1b
+            labels, segments, semantic_masks = self.labels[index].copy(), self.segments[index].copy(), self.semantic_masks[index].copy()
+            if labels.size:
+                labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padw, padh)  # normalized xywh to pixel xyxy format
+                segments = [xyn2xy(x, w, h, padw, padh) for x in segments]
+            semantic_masks = [xyn2xy(x, w, h, padw, padh) for x in semantic_masks]
+            labels4.append(labels)
+            segments4.extend(segments)
+            seg_cls.extend(self.seg_cls[index].copy())
+            semantic_masks4.extend(semantic_masks)
+        # Concat/clip labels
+        labels4 = np.concatenate(labels4, 0)
+        # Clip every box, instance polygon and semantic polygon to the 2s x 2s canvas, in place. Each collection
+        # is clipped in full: the previous loop was bounded by the number of semantic masks, so boxes and
+        # instance polygons beyond that count were left unclipped.
+        for x in (labels4[:, 1:], *segments4, *semantic_masks4):
+            np.clip(x, 0, 2 * s, out=x)
+        # img4, labels4 = replicate(img4, labels4)  # replicate
+        # Augment
+        img4, labels4, segments4, seg_cls, semantic_masks4 = copy_paste(img4, labels4, segments4, seg_cls, semantic_masks4, p=self.hyp["copy_paste"])
+        img4, labels4, segments4, semantic_masks4 = random_perspective(img4,
+                                                      labels4,
+                                                      segments4,
+                                                      semantic_masks4,
+                                                      degrees=self.hyp["degrees"],
+                                                      translate=self.hyp["translate"],
+                                                      scale=self.hyp["scale"],
+                                                      shear=self.hyp["shear"],
+                                                      perspective=self.hyp["perspective"],
+                                                      border=self.mosaic_border)  # border to remove
+        return img4, labels4, segments4, seg_cls, semantic_masks4
+    def cache_seg_labels(self, path = Path('./labels_stuff.cache'), prefix = ''):
+        # Cache dataset labels, check images and read shapes
+        x = {}  # dict
+        nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
+        desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..."
+        with Pool(NUM_THREADS) as pool:
+            pbar = tqdm(pool.imap(verify_image_label, zip(self.im_files, self.seg_files, repeat(prefix))),
+                        desc = desc,
+                        total = len(self.im_files),
+                        bar_format = TQDM_BAR_FORMAT)
+            for im_file, lb, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar:
+                nm += nm_f
+                nf += nf_f
+                ne += ne_f
+                nc += nc_f
+                if im_file:
+                    x[im_file] = [lb, shape, segments]
+                if msg:
+                    msgs.append(msg)
+                pbar.desc = f"{desc}{nf} found, {nm} missing, {ne} empty, {nc} corrupt"
+        pbar.close()
+        if msgs:
+            LOGGER.info('\n'.join(msgs))
+        if nf == 0:
+            LOGGER.warning(f'{prefix}WARNING: No labels found in {path}. {HELP_URL}')
+        x['hash'] = get_hash(self.seg_files + self.im_files)
+        x['results'] = nf, nm, ne, nc, len(self.im_files)
+        x['msgs'] = msgs  # warnings
+        x['version'] = self.cache_version  # cache version
+        try:
+            np.save(path, x)  # save cache for next time
+            path.with_suffix('.cache.npy').rename(path)  # remove .npy suffix
+            LOGGER.info(f'{prefix}New cache created: {path}')
+        except Exception as e:
+            LOGGER.warning(f'{prefix}WARNING: Cache directory {path.parent} is not writeable: {e}')  # not writeable
+        return x
+    @staticmethod
+    def collate_fn(batch):
+        img, label, path, shapes, masks, semantic_masks = zip(*batch)  # transposed
+        batched_masks = torch.cat(masks, 0)
+        for i, l in enumerate(label):
+            l[:, 0] = i  # add target image index for build_targets()
+        return torch.stack(img, 0), torch.cat(label, 0), path, shapes, batched_masks, torch.stack(semantic_masks, 0)
+def polygon2mask(img_size, polygons, color=1, downsample_ratio=1):
+    """
+    Args:
+        img_size (tuple): The image size.
+        polygons (np.ndarray): [N, M], N is the number of polygons,
+            M is the number of points(Be divided by 2).
+    """
+    mask = np.zeros(img_size, dtype=np.uint8)
+    polygons = np.asarray(polygons)
+    polygons = polygons.astype(np.int32)
+    shape = polygons.shape
+    polygons = polygons.reshape(shape[0], -1, 2)
+    cv2.fillPoly(mask, polygons, color=color)
+    nh, nw = (img_size[0] // downsample_ratio, img_size[1] // downsample_ratio)
+    # NOTE: fillPoly firstly then resize is trying the keep the same way
+    # of loss calculation when mask-ratio=1.
+    mask = cv2.resize(mask, (nw, nh))
+    return mask
+def polygons2masks(img_size, polygons, color, downsample_ratio=1):
+    """
+    Args:
+        img_size (tuple): The image size.
+        polygons (list[np.ndarray]): each polygon is [N, M],
+            N is the number of polygons,
+            M is the number of points(Be divided by 2).
+    """
+    masks = []
+    for si in range(len(polygons)):
+        mask = polygon2mask(img_size, [polygons[si].reshape(-1)], color, downsample_ratio)
+        masks.append(mask)
+    return np.array(masks)
+def polygons2masks_overlap(img_size, segments, downsample_ratio=1):
+    """Return a (640, 640) overlap mask."""
+    masks = np.zeros((img_size[0] // downsample_ratio, img_size[1] // downsample_ratio),
+                     dtype=np.int32 if len(segments) > 255 else np.uint8)
+    areas = []
+    ms = []
+    for si in range(len(segments)):
+        mask = polygon2mask(
+            img_size,
+            [segments[si].reshape(-1)],
+            downsample_ratio=downsample_ratio,
+            color=1,
+        )
+        ms.append(mask)
+        areas.append(mask.sum())
+    areas = np.asarray(areas)
+    index = np.argsort(-areas)
+    ms = np.array(ms)[index]
+    for i in range(len(segments)):
+        mask = ms[i] * (i + 1)
+        masks = masks + mask
+        masks = np.clip(masks, a_min=0, a_max=i + 1)
+    return masks, index
--- a/utils/panoptic/general.py
+++ b/utils/panoptic/general.py
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+def crop_mask(masks, boxes):
+    """
+    "Crop" predicted masks by zeroing out everything not in the predicted bbox.
+    Vectorized by Chong (thanks Chong).
+    Args:
+        - masks should be a size [h, w, n] tensor of masks
+        - boxes should be a size [n, 4] tensor of bbox coords in relative point form
+    """
+    n, h, w = masks.shape
+    x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # x1 shape(1,1,n)
+    r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :]  # rows shape(1,w,1)
+    c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None]  # cols shape(h,1,1)
+    return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
+def process_mask_upsample(protos, masks_in, bboxes, shape):
+    """
+    Crop after upsample.
+    proto_out: [mask_dim, mask_h, mask_w]
+    out_masks: [n, mask_dim], n is number of masks after nms
+    bboxes: [n, 4], n is number of masks after nms
+    shape:input_image_size, (h, w)
+    return: h, w, n
+    """
+    c, mh, mw = protos.shape  # CHW
+    masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)
+    masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
+    masks = crop_mask(masks, bboxes)  # CHW
+    return masks.gt_(0.5)
+def process_mask(protos, masks_in, bboxes, shape, upsample=False):
+    """
+    Crop before upsample.
+    proto_out: [mask_dim, mask_h, mask_w]
+    out_masks: [n, mask_dim], n is number of masks after nms
+    bboxes: [n, 4], n is number of masks after nms
+    shape:input_image_size, (h, w)
+    return: h, w, n
+    """
+    c, mh, mw = protos.shape  # CHW
+    ih, iw = shape
+    masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)  # CHW
+    downsampled_bboxes = bboxes.clone()
+    downsampled_bboxes[:, 0] *= mw / iw
+    downsampled_bboxes[:, 2] *= mw / iw
+    downsampled_bboxes[:, 3] *= mh / ih
+    downsampled_bboxes[:, 1] *= mh / ih
+    masks = crop_mask(masks, downsampled_bboxes)  # CHW
+    if upsample:
+        masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
+    return masks.gt_(0.5)
+def scale_image(im1_shape, masks, im0_shape, ratio_pad=None):
+    """
+    img1_shape: model input shape, [h, w]
+    img0_shape: origin pic shape, [h, w, 3]
+    masks: [h, w, num]
+    """
+    # Rescale coordinates (xyxy) from im1_shape to im0_shape
+    if ratio_pad is None:  # calculate from im0_shape
+        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain  = old / new
+        pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
+    else:
+        pad = ratio_pad[1]
+    top, left = int(pad[1]), int(pad[0])  # y, x
+    bottom, right = int(im1_shape[0] - pad[1]), int(im1_shape[1] - pad[0])
+    if len(masks.shape) < 2:
+        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
+    masks = masks[top:bottom, left:right]
+    # masks = masks.permute(2, 0, 1).contiguous()
+    # masks = F.interpolate(masks[None], im0_shape[:2], mode='bilinear', align_corners=False)[0]
+    # masks = masks.permute(1, 2, 0).contiguous()
+    masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]))
+    if len(masks.shape) == 2:
+        masks = masks[:, :, None]
+    return masks
+def mask_iou(mask1, mask2, eps=1e-7):
+    """
+    mask1: [N, n] m1 means number of predicted objects
+    mask2: [M, n] m2 means number of gt objects
+    Note: n means image_w x image_h
+    return: masks iou, [N, M]
+    """
+    intersection = torch.matmul(mask1, mask2.t()).clamp(0)
+    union = (mask1.sum(1)[:, None] + mask2.sum(1)[None]) - intersection  # (area1 + area2) - intersection
+    return intersection / (union + eps)
+def masks_iou(mask1, mask2, eps=1e-7):
+    """
+    mask1: [N, n] m1 means number of predicted objects
+    mask2: [N, n] m2 means number of gt objects
+    Note: n means image_w x image_h
+    return: masks iou, (N, )
+    """
+    intersection = (mask1 * mask2).sum(1).clamp(0)  # (N, )
+    union = (mask1.sum(1) + mask2.sum(1))[None] - intersection  # (area1 + area2) - intersection
+    return intersection / (union + eps)
+def masks2segments(masks, strategy='largest'):
+    # Convert masks(n,160,160) into segments(n,xy)
+    segments = []
+    for x in masks.int().cpu().numpy().astype('uint8'):
+        c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
+        if c:
+            if strategy == 'concat':  # concatenate all segments
+                c = np.concatenate([x.reshape(-1, 2) for x in c])
+            elif strategy == 'largest':  # select largest segment
+                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
+        else:
+            c = np.zeros((0, 2))  # no segments found
+        segments.append(c.astype('float32'))
+    return segments
--- a/utils/panoptic/loss.py
+++ b/utils/panoptic/loss.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..general import xywh2xyxy
+from ..loss import FocalLoss, smooth_BCE
+from ..metrics import bbox_iou
+from ..torch_utils import de_parallel
+from .general import crop_mask
+class ComputeLoss:
+    # Compute losses
+    """Anchor-based box, objectness, classification and mask loss.
+    Hyperparameters are read from ``model.hyp`` and the layout (anchors, number of
+    classes/layers/masks) from the last module of the de-parallelised model.
+    """
+    def __init__(self, model, autobalance=False, overlap=False):
+        """Build the BCE criteria and cache model layout.
+        Args:
+            model: Model exposing ``parameters()``, ``hyp`` and, via ``de_parallel(model).model[-1]``,
+                the attributes ``nl``, ``na``, ``nc``, ``nm``, ``stride`` and ``anchors``.
+            autobalance: When True the per-layer objectness weights are updated on every call.
+            overlap: When True ground-truth masks are read as one index mask per image.
+        """
+        self.sort_obj_iou = False
+        self.overlap = overlap
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        self.device = device
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
+        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h['fl_gamma']  # focal loss gamma
+        if g > 0:
+            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.ssi = list(m.stride).index(16) if autobalance else 0  # stride 16 index
+        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
+        self.na = m.na  # number of anchors
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.nm = m.nm  # number of masks
+        self.anchors = m.anchors
+        self.device = device
+    def __call__(self, preds, targets, masks):  # predictions, targets, ground-truth masks
+        """Compute the total loss for one batch.
+        Args:
+            preds: Pair ``(p, proto)``; ``p`` holds one prediction tensor per layer and
+                ``proto`` has shape (batch size, number of masks, mask height, mask width).
+            targets: Target rows forwarded to `build_targets`.
+            masks: Ground-truth masks; resized with nearest interpolation when their last
+                two dimensions differ from the prototype resolution.
+        Returns:
+            (loss * batch size, detached tensor of (box, seg, obj, cls) losses).
+        """
+        p, proto = preds
+        bs, nm, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
+        lcls = torch.zeros(1, device=self.device)
+        lbox = torch.zeros(1, device=self.device)
+        lobj = torch.zeros(1, device=self.device)
+        lseg = torch.zeros(1, device=self.device)
+        tcls, tbox, indices, anchors, tidxs, xywhn = self.build_targets(p, targets)  # targets
+        # Losses
+        for i, pi in enumerate(p):  # layer index, layer predictions
+            b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
+            tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)  # target obj
+            n = b.shape[0]  # number of targets
+            if n:
+                pxy, pwh, _, pcls, pmask = pi[b, a, gj, gi].split((2, 2, 1, self.nc, nm), 1)  # subset of predictions
+                # Box regression
+                pxy = pxy.sigmoid() * 2 - 0.5
+                pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]
+                pbox = torch.cat((pxy, pwh), 1)  # predicted box
+                iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target)
+                lbox += (1.0 - iou).mean()  # iou loss
+                # Objectness
+                iou = iou.detach().clamp(0).type(tobj.dtype)
+                if self.sort_obj_iou:
+                    j = iou.argsort()
+                    b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
+                if self.gr < 1:
+                    iou = (1.0 - self.gr) + self.gr * iou
+                tobj[b, a, gj, gi] = iou  # iou ratio
+                # Classification
+                if self.nc > 1:  # cls loss (only if multiple classes)
+                    t = torch.full_like(pcls, self.cn, device=self.device)  # targets
+                    t[range(n), tcls[i]] = self.cp
+                    lcls += self.BCEcls(pcls, t)  # BCE
+                # Mask regression
+                if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
+                    masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
+                marea = xywhn[i][:, 2:].prod(1)  # normalized box area (width * height)
+                mxyxy = xywh2xyxy(xywhn[i] * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device))
+                for bi in b.unique():
+                    j = b == bi  # matching index
+                    if self.overlap:
+                        # one index mask per image: select the pixels equal to each target index
+                        mask_gti = torch.where(masks[bi][None] == tidxs[i][j].view(-1, 1, 1), 1.0, 0.0)
+                    else:
+                        mask_gti = masks[tidxs[i]][j]
+                    lseg += self.single_mask_loss(mask_gti, pmask[j], proto[bi], mxyxy[j], marea[j])
+            obji = self.BCEobj(pi[..., 4], tobj)
+            lobj += obji * self.balance[i]  # obj loss
+            if self.autobalance:
+                self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()
+        if self.autobalance:
+            self.balance = [x / self.balance[self.ssi] for x in self.balance]
+        lbox *= self.hyp["box"]
+        lobj *= self.hyp["obj"]
+        lcls *= self.hyp["cls"]
+        lseg *= self.hyp["box"] / bs  # the mask loss reuses the "box" gain
+        loss = lbox + lobj + lcls + lseg
+        return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()
+    def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
+        """Return the mean box-cropped, area-normalised BCE between predicted and gt masks of one image."""
+        # Mask loss for one image
+        pred_mask = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])  # (n,32) @ (32,80,80) -> (n,80,80)
+        loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
+        return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
+    def build_targets(self, p, targets):
+        """Match targets to anchors and grid cells for every prediction layer.
+        Returns:
+            Six lists with one entry per layer: classes, boxes (xy offset within the cell plus wh),
+            (image, anchor, gridy, gridx) indices, matched anchors, target indices and
+            normalized xywh boxes.
+        """
+        # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
+        na, nt = self.na, targets.shape[0]  # number of anchors, targets
+        tcls, tbox, indices, anch, tidxs, xywhn = [], [], [], [], [], []
+        gain = torch.ones(8, device=self.device)  # normalized to gridspace gain
+        ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt)
+        if self.overlap:
+            # target indices restart at 1 for every image, matching the per-image index mask
+            batch = p[0].shape[0]
+            ti = []
+            for i in range(batch):
+                num = (targets[:, 0] == i).sum()  # find number of targets of each image
+                ti.append(torch.arange(num, device=self.device).float().view(1, num).repeat(na, 1) + 1)  # (na, num)
+            ti = torch.cat(ti, 1)  # (na, nt)
+        else:
+            ti = torch.arange(nt, device=self.device).float().view(1, nt).repeat(na, 1)
+        targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None], ti[..., None]), 2)  # append anchor and target indices
+        g = 0.5  # bias
+        off = torch.tensor(
+            [
+                [0, 0],
+                [1, 0],
+                [0, 1],
+                [-1, 0],
+                [0, -1],  # j,k,l,m
+                # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
+            ],
+            device=self.device).float() * g  # offsets
+        for i in range(self.nl):
+            anchors, shape = self.anchors[i], p[i].shape
+            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain
+            # Match targets to anchors
+            t = targets * gain  # shape(na,nt,8): image, class, x, y, w, h, anchor index, target index
+            if nt:
+                # Matches
+                r = t[..., 4:6] / anchors[:, None]  # wh ratio
+                j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t']  # compare
+                # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
+                t = t[j]  # filter
+                # Offsets
+                gxy = t[:, 2:4]  # grid xy
+                gxi = gain[[2, 3]] - gxy  # inverse
+                j, k = ((gxy % 1 < g) & (gxy > 1)).T
+                l, m = ((gxi % 1 < g) & (gxi > 1)).T
+                j = torch.stack((torch.ones_like(j), j, k, l, m))
+                t = t.repeat((5, 1, 1))[j]
+                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
+            else:
+                t = targets[0]
+                offsets = 0
+            # Define
+            bc, gxy, gwh, at = t.chunk(4, 1)  # (image, class), grid xy, grid wh, (anchor, target index)
+            (a, tidx), (b, c) = at.long().T, bc.long().T  # anchors, target indices, image, class
+            gij = (gxy - offsets).long()
+            gi, gj = gij.T  # grid indices
+            # Append
+            indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
+            tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
+            anch.append(anchors[a])  # anchors
+            tcls.append(c)  # class
+            tidxs.append(tidx)
+            xywhn.append(torch.cat((gxy, gwh), 1) / gain[2:6])  # xywh normalized
+        return tcls, tbox, indices, anch, tidxs, xywhn
--- a/utils/panoptic/loss_tal.py
+++ b/utils/panoptic/loss_tal.py
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.ops import sigmoid_focal_loss
+from utils.general import xywh2xyxy, xyxy2xywh
+from utils.metrics import bbox_iou
+from utils.panoptic.tal.anchor_generator import dist2bbox, make_anchors, bbox2dist
+from utils.panoptic.tal.assigner import TaskAlignedAssigner
+from utils.torch_utils import de_parallel
+from utils.panoptic.general import crop_mask
+def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
+    # return positive, negative label smoothing BCE targets
+    return 1.0 - 0.5 * eps, 0.5 * eps
+class VarifocalLoss(nn.Module):
+    # Varifocal loss by Zhang et al. https://arxiv.org/abs/2008.13367
+    def __init__(self):
+        super().__init__()
+    def forward(self, pred_score, gt_score, label, alpha=0.75, gamma=2.0):
+        weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
+        with torch.cuda.amp.autocast(enabled=False):
+            loss = (F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(),
+                                                       reduction="none") * weight).sum()
+        return loss
+class FocalLoss(nn.Module):
+    # Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
+    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
+        super().__init__()
+        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = loss_fcn.reduction
+        self.loss_fcn.reduction = "none"  # required to apply FL to each element
+    def forward(self, pred, true):
+        loss = self.loss_fcn(pred, true)
+        # p_t = torch.exp(-loss)
+        # loss *= self.alpha * (1.000001 - p_t) ** self.gamma  # non-zero power for gradient stability
+        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
+        pred_prob = torch.sigmoid(pred)  # prob from logits
+        p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
+        alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
+        modulating_factor = (1.0 - p_t) ** self.gamma
+        loss *= alpha_factor * modulating_factor
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        else:  # 'none'
+            return loss
+class BboxLoss(nn.Module):
+    def __init__(self, reg_max, use_dfl=False):
+        super().__init__()
+        self.reg_max = reg_max
+        self.use_dfl = use_dfl
+    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
+        # iou loss
+        bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4])  # (b, h*w, 4)
+        pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).view(-1, 4)
+        target_bboxes_pos = torch.masked_select(target_bboxes, bbox_mask).view(-1, 4)
+        bbox_weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1)
+        iou = bbox_iou(pred_bboxes_pos, target_bboxes_pos, xywh=False, CIoU=True)
+        loss_iou = 1.0 - iou
+        #### wiou
+        #iou = bbox_iou(pred_bboxes_pos, target_bboxes_pos, xywh=False, WIoU=True, scale=True)
+        #if type(iou) is tuple:
+        #    if len(iou) == 2:
+        #        loss_iou = (iou[1].detach() * (1 - iou[0]))
+        #        iou = iou[0]
+        #    else:
+        #        loss_iou = (iou[0] * iou[1])
+        #        iou = iou[-1]
+        #else:
+        #    loss_iou = (1.0 - iou)  # iou loss
+        loss_iou *= bbox_weight
+        loss_iou = loss_iou.sum() / target_scores_sum
+        # loss_iou = loss_iou.mean()
+        # dfl loss
+        if self.use_dfl:
+            dist_mask = fg_mask.unsqueeze(-1).repeat([1, 1, (self.reg_max + 1) * 4])
+            pred_dist_pos = torch.masked_select(pred_dist, dist_mask).view(-1, 4, self.reg_max + 1)
+            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
+            target_ltrb_pos = torch.masked_select(target_ltrb, bbox_mask).view(-1, 4)
+            loss_dfl = self._df_loss(pred_dist_pos, target_ltrb_pos) * bbox_weight
+            loss_dfl = loss_dfl.sum() / target_scores_sum
+        else:
+            loss_dfl = torch.tensor(0.0).to(pred_dist.device)
+        return loss_iou, loss_dfl, iou
+    def _df_loss(self, pred_dist, target):
+        target_left = target.to(torch.long)
+        target_right = target_left + 1
+        weight_left = target_right.to(torch.float) - target
+        weight_right = 1 - weight_left
+        loss_left = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction="none").view(
+            target_left.shape) * weight_left
+        loss_right = F.cross_entropy(pred_dist.view(-1, self.reg_max + 1), target_right.view(-1),
+                                     reduction="none").view(target_left.shape) * weight_right
+        return (loss_left + loss_right).mean(-1, keepdim=True)
+class ComputeLoss:
+    # Compute losses
+    def __init__(self, model, use_dfl=True, overlap=True):
+        device = next(model.parameters()).device  # get model device
+        h = model.hyp  # hyperparameters
+        # Define criteria
+        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h["cls_pw"]], device=device), reduction='none')
+        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
+        self.cp, self.cn = smooth_BCE(eps=h.get("label_smoothing", 0.0))  # positive, negative BCE targets
+        # Focal loss
+        g = h["fl_gamma"]  # focal loss gamma
+        if g > 0:
+            BCEcls = FocalLoss(BCEcls, g)
+        m = de_parallel(model).model[-1]  # Detect() module
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02])  # P3-P7
+        self.BCEcls = BCEcls
+        self.hyp = h
+        self.stride = m.stride  # model strides
+        self.nc = m.nc  # number of classes
+        self.nl = m.nl  # number of layers
+        self.no = m.no
+        self.nm = m.nm
+        self.overlap = overlap
+        self.reg_max = m.reg_max
+        self.device = device
+        self.assigner = TaskAlignedAssigner(topk=int(os.getenv('YOLOM', 10)),
+                                            num_classes=self.nc,
+                                            alpha=float(os.getenv('YOLOA', 0.5)),
+                                            beta=float(os.getenv('YOLOB', 6.0)))
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max).float().to(device)  # / 120.0
+        self.use_dfl = use_dfl
+    def preprocess(self, targets, batch_size, scale_tensor):
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+    def bbox_decode(self, anchor_points, pred_dist):
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
+            # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+    def __call__(self, p, targets, masks, semasks, img=None, epoch=0):
+        loss = torch.zeros(6, device=self.device)  # box, cls, dfl
+        feats, pred_masks, proto, psemasks = p if len(p) == 4 else p[1]
+        batch_size, _, mask_h, mask_w = proto.shape
+        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
+            (self.reg_max * 4, self.nc), 1)
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+        pred_masks = pred_masks.permute(0, 2, 1).contiguous()
+        dtype = pred_scores.dtype
+        batch_size, grid_size = pred_scores.shape[:2]
+        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+        # targets
+        try:
+            batch_idx = targets[:, 0].view(-1, 1)
+            targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
+            gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+            mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
+        except RuntimeError as e:
+            raise TypeError('ERROR.') from e
+        # pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+        target_labels, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt)
+        target_scores_sum = target_scores.sum()
+        # cls loss
+        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
+        loss[2] = self.BCEcls(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE
+        # bbox loss
+        if fg_mask.sum():
+            loss[0], loss[3], _ = self.bbox_loss(pred_distri, 
+                                                  pred_bboxes, 
+                                                  anchor_points, 
+                                                  target_bboxes / stride_tensor,
+                                                  target_scores, 
+                                                  target_scores_sum, 
+                                                  fg_mask)
+            # masks loss
+            if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
+                masks = F.interpolate(masks[None], (mask_h, mask_w), mode='nearest')[0]
+            for i in range(batch_size):
+                if fg_mask[i].sum():
+                    mask_idx = target_gt_idx[i][fg_mask[i]]
+                    if self.overlap:
+                        gt_mask = torch.where(masks[[i]] == (mask_idx + 1).view(-1, 1, 1), 1.0, 0.0)
+                    else:
+                        gt_mask = masks[batch_idx.view(-1) == i][mask_idx]
+                    xyxyn = target_bboxes[i][fg_mask[i]] / imgsz[[1, 0, 1, 0]]
+                    marea = xyxy2xywh(xyxyn)[:, 2:].prod(1)
+                    mxyxy = xyxyn * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device)
+                    loss[1] += self.single_mask_loss(gt_mask, pred_masks[i][fg_mask[i]], proto[i], mxyxy,
+                                                     marea)  # seg loss
+        # Semantic Segmentation
+        # focal loss
+        pt = torch.flatten(psemasks, start_dim = 2).permute(0, 2, 1)
+        gt = torch.flatten(semasks, start_dim = 2).permute(0, 2, 1)
+        bs, _, _ = gt.shape
+        #torch.clamp(torch.sigmoid(logits), min=eps, max= 1 - eps)
+        #total_loss = (sigmoid_focal_loss(pt.float(), gt.float(), alpha = .25, gamma = 2., reduction = 'mean')) / 2.
+        #total_loss = (sigmoid_focal_loss(pt.clamp(-16., 16.), gt, alpha = .25, gamma = 2., reduction = 'mean')) / 2.
+        total_loss = (sigmoid_focal_loss(pt, gt, alpha = .25, gamma = 2., reduction = 'mean')) / 2.
+        loss[4] += total_loss * 20.
+        # dice loss
+        pt = torch.flatten(psemasks.softmax(dim = 1))
+        gt = torch.flatten(semasks)
+        inter_mask = torch.sum(torch.mul(pt, gt))
+        union_mask = torch.sum(torch.add(pt, gt))
+        dice_coef = (2. * inter_mask + 1.) / (union_mask + 1.)
+        loss[5] += (1. - dice_coef) / 2.
+        loss[0] *= 7.5  # box gain
+        loss[1] *= 2.5 / batch_size
+        loss[2] *= 0.5  # cls gain
+        loss[3] *= 1.5  # dfl gain
+        loss[4] *= 2.5 #/ batch_size
+        loss[5] *= 2.5 #/ batch_size
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
+    def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
+        # Mask loss for one image
+        pred_mask = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])  # (n, 32) @ (32,80,80) -> (n,80,80)
+        loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction='none')
+        return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()
--- a/utils/panoptic/metrics.py
+++ b/utils/panoptic/metrics.py
+import numpy as np
+import torch
+from ..metrics import ap_per_class
+def fitness(x):
+    # Model fitness as a weighted combination of metrics
+    w = [0.0, 0.0, 0.1, 0.9, 0.0, 0.0, 0.1, 0.9, 0.1, 0.9]
+    return (x[:, :len(w)] * w).sum(1)
+def ap_per_class_box_and_mask(
+        tp_m,
+        tp_b,
+        conf,
+        pred_cls,
+        target_cls,
+        plot=False,
+        save_dir=".",
+        names=(),
+):
+    """
+    Args:
+        tp_b: tp of boxes.
+        tp_m: tp of masks.
+        other arguments see `func: ap_per_class`.
+    """
+    results_boxes = ap_per_class(tp_b,
+                                 conf,
+                                 pred_cls,
+                                 target_cls,
+                                 plot=plot,
+                                 save_dir=save_dir,
+                                 names=names,
+                                 prefix="Box")[2:]
+    results_masks = ap_per_class(tp_m,
+                                 conf,
+                                 pred_cls,
+                                 target_cls,
+                                 plot=plot,
+                                 save_dir=save_dir,
+                                 names=names,
+                                 prefix="Mask")[2:]
+    results = {
+        "boxes": {
+            "p": results_boxes[0],
+            "r": results_boxes[1],
+            "ap": results_boxes[3],
+            "f1": results_boxes[2],
+            "ap_class": results_boxes[4]},
+        "masks": {
+            "p": results_masks[0],
+            "r": results_masks[1],
+            "ap": results_masks[3],
+            "f1": results_masks[2],
+            "ap_class": results_masks[4]}}
+    return results
+class Metric:
+    def __init__(self) -> None:
+        self.p = []  # (nc, )
+        self.r = []  # (nc, )
+        self.f1 = []  # (nc, )
+        self.all_ap = []  # (nc, 10)
+        self.ap_class_index = []  # (nc, )
+    @property
+    def ap50(self):
+        """AP@0.5 of all classes.
+        Return:
+            (nc, ) or [].
+        """
+        return self.all_ap[:, 0] if len(self.all_ap) else []
+    @property
+    def ap(self):
+        """AP@0.5:0.95
+        Return:
+            (nc, ) or [].
+        """
+        return self.all_ap.mean(1) if len(self.all_ap) else []
+    @property
+    def mp(self):
+        """mean precision of all classes.
+        Return:
+            float.
+        """
+        return self.p.mean() if len(self.p) else 0.0
+    @property
+    def mr(self):
+        """mean recall of all classes.
+        Return:
+            float.
+        """
+        return self.r.mean() if len(self.r) else 0.0
+    @property
+    def map50(self):
+        """Mean AP@0.5 of all classes.
+        Return:
+            float.
+        """
+        return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0
+    @property
+    def map(self):
+        """Mean AP@0.5:0.95 of all classes.
+        Return:
+            float.
+        """
+        return self.all_ap.mean() if len(self.all_ap) else 0.0
+    def mean_results(self):
+        """Mean of results, return mp, mr, map50, map"""
+        return (self.mp, self.mr, self.map50, self.map)
+    def class_result(self, i):
+        """class-aware result, return p[i], r[i], ap50[i], ap[i]"""
+        return (self.p[i], self.r[i], self.ap50[i], self.ap[i])
+    def get_maps(self, nc):
+        maps = np.zeros(nc) + self.map
+        for i, c in enumerate(self.ap_class_index):
+            maps[c] = self.ap[i]
+        return maps
+    def update(self, results):
+        """
+        Args:
+            results: tuple(p, r, ap, f1, ap_class)
+        """
+        p, r, all_ap, f1, ap_class_index = results
+        self.p = p
+        self.r = r
+        self.all_ap = all_ap
+        self.f1 = f1
+        self.ap_class_index = ap_class_index
+class Metrics:
+    """Metric for boxes and masks."""
+    def __init__(self) -> None:
+        self.metric_box = Metric()
+        self.metric_mask = Metric()
+    def update(self, results):
+        """
+        Args:
+            results: Dict{'boxes': Dict{}, 'masks': Dict{}}
+        """
+        self.metric_box.update(list(results["boxes"].values()))
+        self.metric_mask.update(list(results["masks"].values()))
+    def mean_results(self):
+        return self.metric_box.mean_results() + self.metric_mask.mean_results()
+    def class_result(self, i):
+        return self.metric_box.class_result(i) + self.metric_mask.class_result(i)
+    def get_maps(self, nc):
+        return self.metric_box.get_maps(nc) + self.metric_mask.get_maps(nc)
+    @property
+    def ap_class_index(self):
+        # boxes and masks have the same ap_class_index
+        return self.metric_box.ap_class_index
+class Semantic_Metrics:
+    def __init__(self, nc, device):
+        self.nc = nc  # number of classes
+        self.device = device
+        self.iou = []
+        self.c_bit_counts = torch.zeros(nc, dtype = torch.long).to(device)
+        self.c_intersection_counts = torch.zeros(nc, dtype = torch.long).to(device)
+        self.c_union_counts = torch.zeros(nc, dtype = torch.long).to(device)
+    def update(self, pred_masks, target_masks):
+        nb, nc, h, w = pred_masks.shape
+        device = pred_masks.device
+        for b in range(nb):
+            onehot_mask = pred_masks[b].to(device)
+            # convert predict mask to one hot
+            semantic_mask = torch.flatten(onehot_mask, start_dim = 1).permute(1, 0) # class x h x w -> (h x w) x class
+            max_idx = semantic_mask.argmax(1)
+            output_masks = (torch.zeros(semantic_mask.shape).to(self.device)).scatter(1, max_idx.unsqueeze(1), 1.0) # one hot: (h x w) x class
+            output_masks = torch.reshape(output_masks.permute(1, 0), (nc, h, w)) # (h x w) x class -> class x h x w
+            onehot_mask = output_masks.int()
+            for c in range(self.nc):
+                pred_mask = onehot_mask[c].to(device)
+                target_mask = target_masks[b, c].to(device)
+                # calculate IoU
+                intersection = (torch.logical_and(pred_mask, target_mask).sum()).item()
+                union = (torch.logical_or(pred_mask, target_mask).sum()).item()
+                iou = 0. if (0 == union) else (intersection / union)
+                # record class pixel counts, intersection counts, union counts
+                self.c_bit_counts[c] += target_mask.int().sum()
+                self.c_intersection_counts[c] += intersection
+                self.c_union_counts[c] += union
+                self.iou.append(iou)
+    def results(self):
+        # Mean IoU
+        miou = 0. if (0 == len(self.iou)) else np.sum(self.iou) / (len(self.iou) * self.nc)
+        # Frequency Weighted IoU
+        c_iou = self.c_intersection_counts / (self.c_union_counts + 1)  # add smooth
+        # c_bit_counts = self.c_bit_counts.astype(int)
+        total_c_bit_counts = self.c_bit_counts.sum()
+        freq_ious = torch.zeros(1, dtype = torch.long).to(self.device) if (0 == total_c_bit_counts) else (self.c_bit_counts / total_c_bit_counts) * c_iou
+        fwiou = (freq_ious.sum()).item()
+        return (miou, fwiou)
+    def reset(self):
+        self.iou = []
+        self.c_bit_counts = torch.zeros(self.nc, dtype = torch.long).to(self.device)
+        self.c_intersection_counts = torch.zeros(self.nc, dtype = torch.long).to(self.device)
+        self.c_union_counts = torch.zeros(self.nc, dtype = torch.long).to(self.device)
+# Names of the logged result columns: six training losses, ten metrics, six validation losses and
+# three learning rates. The (B)/(M)/(S) suffixes presumably stand for box, mask and semantic
+# segmentation metrics - confirm against the training script that writes these columns.
+KEYS = [
+    "train/box_loss",
+    "train/seg_loss",  # train loss
+    "train/cls_loss",
+    "train/dfl_loss",
+    "train/fcl_loss",
+    "train/dic_loss",
+    "metrics/precision(B)",
+    "metrics/recall(B)",
+    "metrics/mAP_0.5(B)",
+    "metrics/mAP_0.5:0.95(B)",  # metrics
+    "metrics/precision(M)",
+    "metrics/recall(M)",
+    "metrics/mAP_0.5(M)",
+    "metrics/mAP_0.5:0.95(M)",  # metrics
+    "metrics/MIOUS(S)",
+    "metrics/FWIOUS(S)",        # metrics
+    "val/box_loss",
+    "val/seg_loss",  # val loss
+    "val/cls_loss",
+    "val/dfl_loss",
+    "val/fcl_loss",
+    "val/dic_loss",
+    "x/lr0",
+    "x/lr1",
+    "x/lr2",]
+# Names of the "best" summary entries: the epoch followed by one entry per metric in KEYS.
+BEST_KEYS = [
+    "best/epoch",
+    "best/precision(B)",
+    "best/recall(B)",
+    "best/mAP_0.5(B)",
+    "best/mAP_0.5:0.95(B)",
+    "best/precision(M)",
+    "best/recall(M)",
+    "best/mAP_0.5(M)",
+    "best/mAP_0.5:0.95(M)",
+    "best/MIOUS(S)",
+    "best/FWIOUS(S)",]
--- a/utils/panoptic/plots.py
+++ b/utils/panoptic/plots.py
+import contextlib
+import math
+from pathlib import Path
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import torch
+from torchvision.utils import draw_segmentation_masks, save_image
+from .. import threaded
+from ..general import xywh2xyxy
+from ..plots import Annotator, colors
+@threaded
+def plot_images_and_masks(images, targets, masks, semasks, paths=None, fname='images.jpg', names=None):
+    try:
+        if images.shape[-2:] != semasks.shape[-2:]:
+            m = torch.nn.Upsample(scale_factor=4, mode='nearest')
+            semasks = m(semasks)
+        for idx in range(images.shape[0]):
+            output_img = draw_segmentation_masks(
+                image = images[idx, :, :, :].cpu().to(dtype = torch.uint8),
+                masks = semasks[idx, :, :, :].cpu().to(dtype = torch.bool),
+                alpha = 1)
+            cv2.imwrite(
+                '{}_{}.jpg'.format(fname, idx),
+                torch.permute(output_img, (1, 2, 0)).numpy()
+            )
+    except:
+        pass
+    # Plot image grid with labels
+    if isinstance(images, torch.Tensor):
+        images = images.cpu().float().numpy()
+    if isinstance(targets, torch.Tensor):
+        targets = targets.cpu().numpy()
+    if isinstance(masks, torch.Tensor):
+        masks = masks.cpu().numpy().astype(int)
+    if isinstance(semasks, torch.Tensor):
+        semasks = semasks.cpu().numpy().astype(int)
+    max_size = 1920  # max image size
+    max_subplots = 16  # max image subplots, i.e. 4x4
+    bs, _, h, w = images.shape  # batch size, _, height, width
+    bs = min(bs, max_subplots)  # limit plot images
+    ns = np.ceil(bs ** 0.5)  # number of subplots (square)
+    if np.max(images[0]) <= 1:
+        images *= 255  # de-normalise (optional)
+    # Build Image
+    mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8)  # init
+    for i, im in enumerate(images):
+        if i == max_subplots:  # if last batch has fewer images than we expect
+            break
+        x, y = int(w * (i // ns)), int(h * (i % ns))  # block origin
+        im = im.transpose(1, 2, 0)
+        mosaic[y:y + h, x:x + w, :] = im
+    # Resize (optional)
+    scale = max_size / ns / max(h, w)
+    if scale < 1:
+        h = math.ceil(scale * h)
+        w = math.ceil(scale * w)
+        mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h)))
+    # Annotate
+    fs = int((h + w) * ns * 0.01)  # font size
+    annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=names)
+    for i in range(i + 1):
+        x, y = int(w * (i // ns)), int(h * (i % ns))  # block origin
+        annotator.rectangle([x, y, x + w, y + h], None, (255, 255, 255), width=2)  # borders
+        if paths:
+            annotator.text((x + 5, y + 5 + h), text=Path(paths[i]).name[:40], txt_color=(220, 220, 220))  # filenames
+        if len(targets) > 0:
+            idx = targets[:, 0] == i
+            ti = targets[idx]  # image targets
+            boxes = xywh2xyxy(ti[:, 2:6]).T
+            classes = ti[:, 1].astype('int')
+            labels = ti.shape[1] == 6  # labels if no conf column
+            conf = None if labels else ti[:, 6]  # check for confidence presence (label vs pred)
+            if boxes.shape[1]:
+                if boxes.max() <= 1.01:  # if normalized with tolerance 0.01
+                    boxes[[0, 2]] *= w  # scale to pixels
+                    boxes[[1, 3]] *= h
+                elif scale < 1:  # absolute coords need scale if image scales
+                    boxes *= scale
+            boxes[[0, 2]] += x
+            boxes[[1, 3]] += y
+            for j, box in enumerate(boxes.T.tolist()):
+                cls = classes[j]
+                color = colors(cls)
+                cls = names[cls] if names else cls
+                if labels or conf[j] > 0.25:  # 0.25 conf thresh
+                    label = f'{cls}' if labels else f'{cls} {conf[j]:.1f}'
+                    annotator.box_label(box, label, color=color)
+            # Plot masks
+            if len(masks):
+                if masks.max() > 1.0:  # mean that masks are overlap
+                    image_masks = masks[[i]]  # (1, 640, 640)
+                    nl = len(ti)
+                    index = np.arange(nl).reshape(nl, 1, 1) + 1
+                    image_masks = np.repeat(image_masks, nl, axis=0)
+                    image_masks = np.where(image_masks == index, 1.0, 0.0)
+                else:
+                    image_masks = masks[idx]
+                im = np.asarray(annotator.im).copy()
+                for j, box in enumerate(boxes.T.tolist()):
+                    if labels or conf[j] > 0.25:  # 0.25 conf thresh
+                        color = colors(classes[j])
+                        mh, mw = image_masks[j].shape
+                        if mh != h or mw != w:
+                            mask = image_masks[j].astype(np.uint8)
+                            mask = cv2.resize(mask, (w, h))
+                            mask = mask.astype(bool)
+                        else:
+                            mask = image_masks[j].astype(bool)
+                        with contextlib.suppress(Exception):
+                            im[y:y + h, x:x + w, :][mask] = im[y:y + h, x:x + w, :][mask] * 0.4 + np.array(color) * 0.6
+                annotator.fromarray(im)
+    annotator.im.save(fname)  # save
+def plot_results_with_masks(file="path/to/results.csv", dir="", best=True):
+    # Plot training results.csv. Usage: from utils.plots import *; plot_results('path/to/results.csv')
+    save_dir = Path(file).parent if file else Path(dir)
+    fig, ax = plt.subplots(2, 8, figsize=(18, 6), tight_layout=True)
+    ax = ax.ravel()
+    files = list(save_dir.glob("results*.csv"))
+    assert len(files), f"No results.csv files found in {save_dir.resolve()}, nothing to plot."
+    for f in files:
+        try:
+            data = pd.read_csv(f)
+            index = np.argmax(0.9 * data.values[:, 8] + 0.1 * data.values[:, 7] + 0.9 * data.values[:, 12] +
+                              0.1 * data.values[:, 11])
+            s = [x.strip() for x in data.columns]
+            x = data.values[:, 0]
+            for i, j in enumerate([1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 7, 8, 11, 12]):
+                y = data.values[:, j]
+                # y[y == 0] = np.nan  # don't show zero values
+                ax[i].plot(x, y, marker=".", label=f.stem, linewidth=2, markersize=2)
+                if best:
+                    # best
+                    ax[i].scatter(index, y[index], color="r", label=f"best:{index}", marker="*", linewidth=3)
+                    ax[i].set_title(s[j] + f"\n{round(y[index], 5)}")
+                else:
+                    # last
+                    ax[i].scatter(x[-1], y[-1], color="r", label="last", marker="*", linewidth=3)
+                    ax[i].set_title(s[j] + f"\n{round(y[-1], 5)}")
+                # if j in [8, 9, 10]:  # share train and val loss y axes
+                #     ax[i].get_shared_y_axes().join(ax[i], ax[i - 5])
+        except Exception as e:
+            print(f"Warning: Plotting error for {f}: {e}")
+    ax[1].legend()
+    fig.savefig(save_dir / "results.png", dpi=200)
+    plt.close()
--- a/utils/panoptic/tal/__init__.py
+++ b/utils/panoptic/tal/__init__.py
+# init
\ No newline at end of file
--- a/utils/panoptic/tal/anchor_generator.py
+++ b/utils/panoptic/tal/anchor_generator.py
+import torch
+from utils.general import check_version
+# Result of check_version() for torch '1.10.0'; make_anchors uses it to decide whether to pass
+# the `indexing` argument to torch.meshgrid.
+TORCH_1_10 = check_version(torch.__version__, '1.10.0')
+def make_anchors(feats, strides, grid_cell_offset=0.5):
+    """Generate anchors from features."""
+    anchor_points, stride_tensor = [], []
+    assert feats is not None
+    dtype, device = feats[0].dtype, feats[0].device
+    for i, stride in enumerate(strides):
+        _, _, h, w = feats[i].shape
+        sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset  # shift x
+        sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset  # shift y
+        sy, sx = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
+        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
+        stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
+    return torch.cat(anchor_points), torch.cat(stride_tensor)
+def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
+    """Transform distance(ltrb) to box(xywh or xyxy)."""
+    lt, rb = torch.split(distance, 2, dim)
+    x1y1 = anchor_points - lt
+    x2y2 = anchor_points + rb
+    if xywh:
+        c_xy = (x1y1 + x2y2) / 2
+        wh = x2y2 - x1y1
+        return torch.cat((c_xy, wh), dim)  # xywh bbox
+    return torch.cat((x1y1, x2y2), dim)  # xyxy bbox
+def bbox2dist(anchor_points, bbox, reg_max):
+    """Transform bbox(xyxy) to dist(ltrb)."""
+    x1y1, x2y2 = torch.split(bbox, 2, -1)
+    return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp(0, reg_max - 0.01)  # dist (lt, rb)
--- a/utils/panoptic/tal/assigner.py
+++ b/utils/panoptic/tal/assigner.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.metrics import bbox_iou
+def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
+    """select the positive anchor center in gt
+    Args:
+        xy_centers (Tensor): shape(h*w, 4)
+        gt_bboxes (Tensor): shape(b, n_boxes, 4)
+    Return:
+        (Tensor): shape(b, n_boxes, h*w)
+    """
+    n_anchors = xy_centers.shape[0]
+    bs, n_boxes, _ = gt_bboxes.shape
+    lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)  # left-top, right-bottom
+    bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
+    # return (bbox_deltas.min(3)[0] > eps).to(gt_bboxes.dtype)
+    return bbox_deltas.amin(3).gt_(eps)
+def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
+    """if an anchor box is assigned to multiple gts,
+        the one with the highest iou will be selected.
+    Args:
+        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
+        overlaps (Tensor): shape(b, n_max_boxes, h*w)
+    Return:
+        target_gt_idx (Tensor): shape(b, h*w)
+        fg_mask (Tensor): shape(b, h*w)
+        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
+    """
+    # (b, n_max_boxes, h*w) -> (b, h*w)
+    fg_mask = mask_pos.sum(-2)
+    if fg_mask.max() > 1:  # one anchor is assigned to multiple gt_bboxes
+        mask_multi_gts = (fg_mask.unsqueeze(1) > 1).repeat([1, n_max_boxes, 1])  # (b, n_max_boxes, h*w)
+        max_overlaps_idx = overlaps.argmax(1)  # (b, h*w)
+        is_max_overlaps = F.one_hot(max_overlaps_idx, n_max_boxes)  # (b, h*w, n_max_boxes)
+        is_max_overlaps = is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype)  # (b, n_max_boxes, h*w)
+        mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos)  # (b, n_max_boxes, h*w)
+        fg_mask = mask_pos.sum(-2)
+    # find each grid serve which gt(index)
+    target_gt_idx = mask_pos.argmax(-2)  # (b, h*w)
+    return target_gt_idx, fg_mask, mask_pos
+class TaskAlignedAssigner(nn.Module):
+    def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9):
+        super().__init__()
+        self.topk = topk
+        self.num_classes = num_classes
+        self.bg_idx = num_classes
+        self.alpha = alpha
+        self.beta = beta
+        self.eps = eps
+    @torch.no_grad()
+    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
+        """This code referenced to
+           https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py
+        Args:
+            pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
+            pd_bboxes (Tensor): shape(bs, num_total_anchors, 4)
+            anc_points (Tensor): shape(num_total_anchors, 2)
+            gt_labels (Tensor): shape(bs, n_max_boxes, 1)
+            gt_bboxes (Tensor): shape(bs, n_max_boxes, 4)
+            mask_gt (Tensor): shape(bs, n_max_boxes, 1)
+        Returns:
+            target_labels (Tensor): shape(bs, num_total_anchors)
+            target_bboxes (Tensor): shape(bs, num_total_anchors, 4)
+            target_scores (Tensor): shape(bs, num_total_anchors, num_classes)
+            fg_mask (Tensor): shape(bs, num_total_anchors)
+        """
+        self.bs = pd_scores.size(0)
+        self.n_max_boxes = gt_bboxes.size(1)
+        if self.n_max_boxes == 0:
+            device = gt_bboxes.device
+            return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device),
+                    torch.zeros_like(pd_bboxes).to(device),
+                    torch.zeros_like(pd_scores).to(device),
+                    torch.zeros_like(pd_scores[..., 0]).to(device),
+                    torch.zeros_like(pd_scores[..., 0]).to(device))
+        mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points,
+                                                             mask_gt)
+        target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)
+        # assigned target
+        target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)
+        # normalize
+        align_metric *= mask_pos
+        pos_align_metrics = align_metric.amax(axis=-1, keepdim=True)  # b, max_num_obj
+        pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True)  # b, max_num_obj
+        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
+        target_scores = target_scores * norm_align_metric
+        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
+    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
+        # get anchor_align metric, (b, max_num_obj, h*w)
+        align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes)
+        # get in_gts mask, (b, max_num_obj, h*w)
+        mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes)
+        # get topk_metric mask, (b, max_num_obj, h*w)
+        mask_topk = self.select_topk_candidates(align_metric * mask_in_gts,
+                                                topk_mask=mask_gt.repeat([1, 1, self.topk]).bool())
+        # merge all mask to a final mask, (b, max_num_obj, h*w)
+        mask_pos = mask_topk * mask_in_gts * mask_gt
+        return mask_pos, align_metric, overlaps
+    def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes):
+        gt_labels = gt_labels.to(torch.long)  # b, max_num_obj, 1
+        ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)  # 2, b, max_num_obj
+        ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes)  # b, max_num_obj
+        ind[1] = gt_labels.squeeze(-1)  # b, max_num_obj
+        # get the scores of each grid for each gt cls
+        bbox_scores = pd_scores[ind[0], :, ind[1]]  # b, max_num_obj, h*w
+        overlaps = bbox_iou(gt_bboxes.unsqueeze(2), pd_bboxes.unsqueeze(1), xywh=False, CIoU=True).squeeze(3).clamp(0)
+        #overlaps = bbox_iou(gt_bboxes.unsqueeze(2), pd_bboxes.unsqueeze(1), xywh=False, WIoU=True, scale=True)[-1].squeeze(3).clamp(0)
+        align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
+        return align_metric, overlaps
+    def select_topk_candidates(self, metrics, largest=True, topk_mask=None):
+        """
+        Args:
+            metrics: (b, max_num_obj, h*w).
+            topk_mask: (b, max_num_obj, topk) or None
+        """
+        num_anchors = metrics.shape[-1]  # h*w
+        # (b, max_num_obj, topk)
+        topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=largest)
+        if topk_mask is None:
+            topk_mask = (topk_metrics.max(-1, keepdim=True) > self.eps).tile([1, 1, self.topk])
+        # (b, max_num_obj, topk)
+        topk_idxs = torch.where(topk_mask, topk_idxs, 0)
+        # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w)
+        is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(-2)
+        # filter invalid bboxes
+        # assigned topk should be unique, this is for dealing with empty labels
+        # since empty labels will generate index `0` through `F.one_hot`
+        # NOTE: but what if the topk_idxs include `0`?
+        is_in_topk = torch.where(is_in_topk > 1, 0, is_in_topk)
+        return is_in_topk.to(metrics.dtype)
+    def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask):
+        """
+        Args:
+            gt_labels: (b, max_num_obj, 1)
+            gt_bboxes: (b, max_num_obj, 4)
+            target_gt_idx: (b, h*w)
+            fg_mask: (b, h*w)
+        """
+        # assigned target labels, (b, 1)
+        batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None]
+        target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes  # (b, h*w)
+        target_labels = gt_labels.long().flatten()[target_gt_idx]  # (b, h*w)
+        # assigned target boxes, (b, max_num_obj, 4) -> (b, h*w)
+        target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx]
+        # assigned target scores
+        target_labels.clamp(0)
+        target_scores = F.one_hot(target_labels, self.num_classes)  # (b, h*w, 80)
+        fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes)  # (b, h*w, 80)
+        target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)
+        return target_labels, target_bboxes, target_scores
--- a/utils/plots.py
+++ b/utils/plots.py
+import contextlib
+import math
+import os
+from copy import copy
+from pathlib import Path
+from urllib.error import URLError
+import cv2
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sn
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from utils import TryExcept, threaded
+from utils.general import (CONFIG_DIR, FONT, LOGGER, check_font, check_requirements, clip_boxes, increment_path,
+                           is_ascii, xywh2xyxy, xyxy2xywh)
+from utils.metrics import fitness
+from utils.segment.general import scale_image
+# Settings
+RANK = int(os.getenv('RANK', -1))  # integer value of the RANK environment variable, -1 when unset
+matplotlib.rc('font', **{'size': 11})  # font size applied to all matplotlib figures
+matplotlib.use('Agg')  # for writing to files only
+class Colors:
+    # Ultralytics color palette https://ultralytics.com/
+    def __init__(self):
+        # hex = matplotlib.colors.TABLEAU_COLORS.values()
+        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
+                '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
+        self.palette = [self.hex2rgb(f'#{c}') for c in hexs]
+        self.n = len(self.palette)
+    def __call__(self, i, bgr=False):
+        c = self.palette[int(i) % self.n]
+        return (c[2], c[1], c[0]) if bgr else c
+    @staticmethod
+    def hex2rgb(h):  # rgb order (PIL)
+        return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
+colors = Colors()  # create instance for 'from utils.plots import colors'
+def check_pil_font(font=FONT, size=10):
+    # Return a PIL TrueType Font, downloading to CONFIG_DIR if necessary
+    font = Path(font)
+    font = font if font.exists() else (CONFIG_DIR / font.name)
+    try:
+        return ImageFont.truetype(str(font) if font.exists() else font.name, size)
+    except Exception:  # download if missing
+        try:
+            check_font(font)
+            return ImageFont.truetype(str(font), size)
+        except TypeError:
+            check_requirements('Pillow>=8.4.0')  # known issue https://github.com/ultralytics/yolov5/issues/5374
+        except URLError:  # not online
+            return ImageFont.load_default()
+class Annotator:
+    # YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations
+    def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=False, example='abc'):
+        assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
+        non_ascii = not is_ascii(example)  # non-latin labels, i.e. asian, arabic, cyrillic
+        self.pil = pil or non_ascii
+        if self.pil:  # use PIL
+            self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
+            self.draw = ImageDraw.Draw(self.im)
+            self.font = check_pil_font(font='Arial.Unicode.ttf' if non_ascii else font,
+                                       size=font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
+        else:  # use cv2
+            self.im = im
+        self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width
+    def box_label(self, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
+        # Add one xyxy box to image with label
+        if self.pil or not is_ascii(label):
+            self.draw.rectangle(box, width=self.lw, outline=color)  # box
+            if label:
+                w, h = self.font.getsize(label)  # text width, height
+                outside = box[1] - h >= 0  # label fits outside box
+                self.draw.rectangle(
+                    (box[0], box[1] - h if outside else box[1], box[0] + w + 1,
+                     box[1] + 1 if outside else box[1] + h + 1),
+                    fill=color,
+                )
+                # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls')  # for PIL>8.0
+                self.draw.text((box[0], box[1] - h if outside else box[1]), label, fill=txt_color, font=self.font)
+        else:  # cv2
+            p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
+            cv2.rectangle(self.im, p1, p2, color, thickness=self.lw, lineType=cv2.LINE_AA)
+            if label:
+                tf = max(self.lw - 1, 1)  # font thickness
+                w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0]  # text width, height
+                outside = p1[1] - h >= 3
+                p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
+                cv2.rectangle(self.im, p1, p2, color, -1, cv2.LINE_AA)  # filled
+                cv2.putText(self.im,
+                            label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
+                            0,
+                            self.lw / 3,
+                            txt_color,
+                            thickness=tf,
+                            lineType=cv2.LINE_AA)
+    def masks(self, masks, colors, im_gpu=None, alpha=0.5):
+        """Blend all masks into the image at once; an empty mask set leaves the image untouched.
+        Args:
+            masks (tensor): predicted masks on cuda, shape: [n, h, w]
+            colors (List[List[Int]]): colors for predicted masks, [[r, g, b] * n]
+            im_gpu (tensor): img is in cuda, shape: [3, h, w], range: [0, 1]
+            alpha (float): mask transparency: 0.0 fully transparent, 1.0 opaque
+        """
+        if self.pil:
+            # convert to numpy first
+            self.im = np.asarray(self.im).copy()
+        # Guarding on len(masks) (instead of returning early) keeps the PIL restore below reachable and
+        # avoids indexing inv_alph_masks[-1] on an empty tensor in the im_gpu branch
+        if len(masks):
+            if im_gpu is None:
+                # Add multiple masks of shape(h,w,n) with colors list([r,g,b], [r,g,b], ...)
+                if isinstance(masks, torch.Tensor):
+                    masks = torch.as_tensor(masks, dtype=torch.uint8)
+                    masks = masks.permute(1, 2, 0).contiguous()
+                    masks = masks.cpu().numpy()
+                masks = scale_image(masks.shape[:2], masks, self.im.shape)
+                masks = np.asarray(masks, dtype=np.float32)
+                colors = np.asarray(colors, dtype=np.float32)  # shape(n,3)
+                s = masks.sum(2, keepdims=True).clip(0, 1)  # add all masks together
+                masks = (masks @ colors).clip(0, 255)  # (h,w,n) @ (n,3) = (h,w,3)
+                self.im[:] = masks * alpha + self.im * (1 - s * alpha)
+            else:
+                colors = torch.tensor(colors, device=im_gpu.device, dtype=torch.float32) / 255.0
+                colors = colors[:, None, None]  # shape(n,1,1,3)
+                masks = masks.unsqueeze(3)  # shape(n,h,w,1)
+                masks_color = masks * (colors * alpha)  # shape(n,h,w,3)
+                inv_alph_masks = (1 - masks * alpha).cumprod(0)  # shape(n,h,w,1)
+                mcs = (masks_color * inv_alph_masks).sum(0) * 2  # mask color summand shape(n,h,w,3)
+                im_gpu = im_gpu.flip(dims=[0])  # flip channel
+                im_gpu = im_gpu.permute(1, 2, 0).contiguous()  # shape(h,w,3)
+                im_gpu = im_gpu * inv_alph_masks[-1] + mcs
+                im_mask = (im_gpu * 255).byte().cpu().numpy()
+                self.im[:] = scale_image(im_gpu.shape, im_mask, self.im.shape)
+        if self.pil:
+            # convert im back to PIL and update draw
+            self.fromarray(self.im)
+    def rectangle(self, xy, fill=None, outline=None, width=1):
+        """Draw a rectangle on the image by forwarding to the PIL drawing context (PIL-only)."""
+        self.draw.rectangle(xy, fill=fill, outline=outline, width=width)
+    def text(self, xy, text, txt_color=(255, 255, 255), anchor='top'):
+        """Write text onto the image through the PIL drawing context (PIL-only).
+        Args:
+            xy: (x, y) position as a list or tuple; it is never modified
+            text (str): text to draw
+            txt_color: text colour
+            anchor (str): 'bottom' shifts y up by the text height minus one; any other value draws at xy as given
+        """
+        if anchor == 'bottom':  # start y from font bottom
+            # ImageFont.getsize() was removed in Pillow 10, so prefer getbbox() when the font has it
+            if hasattr(self.font, 'getbbox'):
+                h = self.font.getbbox(text)[3]  # text height
+            else:
+                h = self.font.getsize(text)[1]  # text height (older Pillow)
+            # build a new position: tuples cannot be updated in place and the caller's list must stay intact
+            xy = [xy[0], xy[1] + 1 - h]
+        self.draw.text(xy, text, fill=txt_color, font=self.font)
+    def fromarray(self, im):
+        """Replace self.im with a PIL image (converting numpy input) and rebuild the drawing context."""
+        if not isinstance(im, Image.Image):
+            im = Image.fromarray(im)
+        self.im = im
+        self.draw = ImageDraw.Draw(self.im)
+    def result(self):
+        """Return the annotated image as a numpy array."""
+        annotated = np.asarray(self.im)
+        return annotated
+def feature_visualization(x, module_type, stage, n=32, save_dir=Path('runs/detect/exp')):
+    """
+    Save a grid image of feature maps plus the raw features of batch item 0 (as .npy).
+    Nothing is written for module types containing 'Detect' or for 1-pixel-high/wide maps.
+    x:              Features to be visualized, unpacked as (batch, channels, height, width)
+    module_type:    Module type
+    stage:          Module stage within model
+    n:              Maximum number of feature maps to plot
+    save_dir:       Directory to save results
+    """
+    if 'Detect' in module_type:
+        return
+    _, channels, height, width = x.shape  # batch, channels, height, width
+    if not (height > 1 and width > 1):
+        return
+    f = save_dir / f"stage{stage}_{module_type.split('.')[-1]}_features.png"  # filename
+    first_item = x[0].cpu()  # select batch index 0
+    blocks = torch.chunk(first_item, channels, dim=0)  # one block per channel
+    n = min(n, channels)  # number of plots
+    _, ax = plt.subplots(math.ceil(n / 8), 8, tight_layout=True)  # n/8 rows x 8 cols
+    ax = ax.ravel()
+    plt.subplots_adjust(wspace=0.05, hspace=0.05)
+    for axis, block in zip(ax, blocks[:n]):
+        axis.imshow(block.squeeze())  # cmap='gray'
+        axis.axis('off')
+    LOGGER.info(f'Saving {f}... ({n}/{channels})')
+    plt.savefig(f, dpi=300, bbox_inches='tight')
+    plt.close()
+    np.save(str(f.with_suffix('.npy')), x[0].cpu().numpy())  # npy save
+def hist2d(x, y, n=100):
+    """Return, for every (x, y) point, the log of the count in its 2d-histogram bin (labels.png, evolve.png)."""
+    x_edges = np.linspace(x.min(), x.max(), n)
+    y_edges = np.linspace(y.min(), y.max(), n)
+    counts, x_edges, y_edges = np.histogram2d(x, y, (x_edges, y_edges))
+    n_rows, n_cols = counts.shape
+    # digitize is 1-based and puts the maximum past the last bin, hence the shift and clip
+    x_bin = np.clip(np.digitize(x, x_edges) - 1, 0, n_rows - 1)
+    y_bin = np.clip(np.digitize(y, y_edges) - 1, 0, n_cols - 1)
+    return np.log(counts[x_bin, y_bin])
+def butter_lowpass_filtfilt(data, cutoff=1500, fs=50000, order=5):
+    """Low-pass data with a digital Butterworth filter applied forward and backward (filtfilt)."""
+    # https://stackoverflow.com/questions/28536191/how-to-filter-smooth-with-scipy-numpy
+    from scipy.signal import butter, filtfilt
+    nyquist = 0.5 * fs
+    b, a = butter(order, cutoff / nyquist, btype='low', analog=False)  # cutoff normalised by Nyquist
+    return filtfilt(b, a, data)  # forward-backward filter
+def output_to_target(output, max_det=300):
+    """Convert model output to target format [batch_id, class_id, x, y, w, h, conf] for plotting.
+    At most max_det rows per image are kept; the result is one stacked numpy array.
+    """
+    per_image = []
+    for batch_id, detections in enumerate(output):
+        kept = detections[:max_det, :6].cpu()
+        box, conf, cls = kept.split((4, 1, 1), 1)
+        id_column = torch.full((conf.shape[0], 1), batch_id)
+        per_image.append(torch.cat((id_column, cls, xyxy2xywh(box), conf), 1))
+    return torch.cat(per_image, 0).numpy()
+@threaded
+def plot_images(images, targets, paths=None, fname='images.jpg', names=None):
+    """Save a square mosaic of up to 16 batch images annotated with their boxes.
+    Args:
+        images: batch unpacked as (batch, channels, height, width), tensor or numpy array; never modified
+        targets: rows of [batch_id, class_id, x, y, w, h] with an optional 7th confidence column
+        paths: optional per-image file paths; the file name (first 40 chars) is written into each cell
+        fname: output image path
+        names: optional class-index to class-name mapping used for the labels
+    """
+    if isinstance(images, torch.Tensor):
+        images = images.cpu().float().numpy()
+    if isinstance(targets, torch.Tensor):
+        targets = targets.cpu().numpy()
+    max_size = 1920  # max image size
+    max_subplots = 16  # max image subplots, i.e. 4x4
+    bs, _, h, w = images.shape  # batch size, _, height, width
+    bs = min(bs, max_subplots)  # limit plot images
+    ns = np.ceil(bs ** 0.5)  # number of subplots (square)
+    if np.max(images[0]) <= 1:
+        # de-normalise on a copy: an in-place `*=` would also rescale the caller's array (or CPU tensor)
+        images = images * 255
+    # Build Image
+    mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8)  # init
+    for i, im in enumerate(images[:bs]):
+        x, y = int(w * (i // ns)), int(h * (i % ns))  # block origin
+        mosaic[y:y + h, x:x + w, :] = im.transpose(1, 2, 0)
+    # Resize (optional)
+    scale = max_size / ns / max(h, w)
+    if scale < 1:
+        h = math.ceil(scale * h)
+        w = math.ceil(scale * w)
+        mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h)))
+    # Annotate
+    fs = int((h + w) * ns * 0.01)  # font size
+    annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=names)
+    # iterate over the plotted cells only; reusing the leaked loop index ran one cell past the grid for big batches
+    for i in range(bs):
+        x, y = int(w * (i // ns)), int(h * (i % ns))  # block origin
+        annotator.rectangle([x, y, x + w, y + h], None, (255, 255, 255), width=2)  # borders
+        if paths:
+            annotator.text((x + 5, y + 5), text=Path(paths[i]).name[:40], txt_color=(220, 220, 220))  # filenames
+        if len(targets) > 0:
+            ti = targets[targets[:, 0] == i]  # image targets
+            boxes = xywh2xyxy(ti[:, 2:6]).T
+            classes = ti[:, 1].astype('int')
+            labels = ti.shape[1] == 6  # labels if no conf column
+            conf = None if labels else ti[:, 6]  # check for confidence presence (label vs pred)
+            if boxes.shape[1]:
+                if boxes.max() <= 1.01:  # if normalized with tolerance 0.01
+                    boxes[[0, 2]] *= w  # scale to pixels
+                    boxes[[1, 3]] *= h
+                elif scale < 1:  # absolute coords need scale if image scales
+                    boxes *= scale
+            boxes[[0, 2]] += x
+            boxes[[1, 3]] += y
+            for j, box in enumerate(boxes.T.tolist()):
+                cls = classes[j]
+                color = colors(cls)
+                cls = names[cls] if names else cls
+                if labels or conf[j] > 0.25:  # 0.25 conf thresh
+                    label = f'{cls}' if labels else f'{cls} {conf[j]:.1f}'
+                    annotator.box_label(box, label, color=color)
+    annotator.im.save(fname)  # save
+def plot_lr_scheduler(optimizer, scheduler, epochs=300, save_dir=''):
+    """Step copies of the optimizer/scheduler for `epochs` epochs and save the LR curve as LR.png."""
+    optimizer = copy(optimizer)  # do not modify originals
+    scheduler = copy(scheduler)
+    lr_history = []
+    for _ in range(epochs):
+        scheduler.step()
+        first_group = optimizer.param_groups[0]
+        lr_history.append(first_group['lr'])
+    plt.plot(lr_history, '.-', label='LR')
+    plt.xlabel('epoch')
+    plt.ylabel('LR')
+    plt.grid()
+    plt.xlim(0, epochs)
+    plt.ylim(0)
+    out_file = Path(save_dir) / 'LR.png'
+    plt.savefig(out_file, dpi=200)
+    plt.close()
+def plot_val_txt():  # from utils.plots import *; plot_val()
+    """Read boxes from val.txt and save 2d (hist2d.png) and 1d (hist1d.png) histograms of their centres."""
+    rows = np.loadtxt('val.txt', dtype=np.float32)
+    xywh = xyxy2xywh(rows[:, :4])
+    cx = xywh[:, 0]
+    cy = xywh[:, 1]
+    _, ax = plt.subplots(1, 1, figsize=(6, 6), tight_layout=True)
+    ax.hist2d(cx, cy, bins=600, cmax=10, cmin=0)
+    ax.set_aspect('equal')
+    plt.savefig('hist2d.png', dpi=300)
+    _, ax = plt.subplots(1, 2, figsize=(12, 6), tight_layout=True)
+    for axis, centres in zip(ax, (cx, cy)):
+        axis.hist(centres, bins=600)
+    plt.savefig('hist1d.png', dpi=200)
+def plot_targets_txt():  # from utils.plots import *; plot_targets_txt()
+    """Read targets.txt and save one histogram per column (x, y, width, height) to targets.jpg."""
+    columns = np.loadtxt('targets.txt', dtype=np.float32).T
+    titles = ['x targets', 'y targets', 'width targets', 'height targets']
+    _, ax = plt.subplots(2, 2, figsize=(8, 8), tight_layout=True)
+    for i, (axis, title) in enumerate(zip(ax.ravel(), titles)):
+        values = columns[i]
+        axis.hist(values, bins=100, label=f'{values.mean():.3g} +/- {values.std():.3g}')
+        axis.legend()
+        axis.set_title(title)
+    plt.savefig('targets.jpg', dpi=200)
+def plot_val_study(file='', dir='', x=None):  # from utils.plots import *; plot_val_study()
+    """Plot every study*.txt found next to `file` (or inside `dir`) and save the figure as study.png.
+    Each study contributes one speed-vs-AP curve; a fixed EfficientDet curve is drawn for comparison.
+    """
+    save_dir = Path(file).parent if file else Path(dir)
+    plot2 = False  # plot additional results
+    if plot2:
+        ax = plt.subplots(2, 4, figsize=(10, 6), tight_layout=True)[1].ravel()
+    fig2, ax2 = plt.subplots(1, 1, figsize=(8, 4), tight_layout=True)
+    for study_file in sorted(save_dir.glob('study*.txt')):
+        y = np.loadtxt(study_file, dtype=np.float32, usecols=[0, 1, 2, 3, 7, 8, 9], ndmin=2).T
+        x = np.arange(y.shape[1]) if x is None else np.array(x)
+        if plot2:
+            titles = ['P', 'R', 'mAP@.5', 'mAP@.5:.95', 't_preprocess (ms/img)', 't_inference (ms/img)',
+                      't_NMS (ms/img)']
+            for i, title in enumerate(titles):
+                ax[i].plot(x, y[i], '.-', linewidth=2, markersize=8)
+                ax[i].set_title(title)
+        stop = y[3].argmax() + 1  # plot up to and including the best row of column 3
+        legend_label = study_file.stem.replace('study_coco_', '').replace('yolo', 'YOLO')
+        ax2.plot(y[5, 1:stop], y[3, 1:stop] * 1E2, '.-', linewidth=2, markersize=8, label=legend_label)
+    efficientdet_speed = 1E3 / np.array([209, 140, 97, 58, 35, 18])
+    efficientdet_ap = [34.6, 40.5, 43.0, 47.5, 49.7, 51.5]
+    ax2.plot(efficientdet_speed, efficientdet_ap, 'k.-', linewidth=2, markersize=8, alpha=.25, label='EfficientDet')
+    ax2.grid(alpha=0.2)
+    ax2.set_yticks(np.arange(20, 60, 5))
+    ax2.set_xlim(0, 57)
+    ax2.set_ylim(25, 55)
+    ax2.set_xlabel('GPU Speed (ms/img)')
+    ax2.set_ylabel('COCO AP val')
+    ax2.legend(loc='lower right')
+    f = save_dir / 'study.png'
+    print(f'Saving {f}...')
+    plt.savefig(f, dpi=300)
+@TryExcept()  # known issue https://github.com/ultralytics/yolov5/issues/5395
+def plot_labels(labels, names=(), save_dir=Path('')):
+    """Plot dataset label statistics to labels_correlogram.jpg and labels.jpg in save_dir.
+    Args:
+        labels: numpy array with rows [class, x, y, width, height]; it is not modified
+        names: class names as a dict (index -> name) or a sequence; used as x tick labels when fewer than 30
+        save_dir (Path): output directory
+    """
+    LOGGER.info(f"Plotting labels to {save_dir / 'labels.jpg'}... ")
+    c, b = labels[:, 0], labels[:, 1:].transpose()  # classes, boxes
+    nc = int(c.max() + 1)  # number of classes
+    x = pd.DataFrame(b.transpose(), columns=['x', 'y', 'width', 'height'])
+    # seaborn correlogram
+    sn.pairplot(x, corner=True, diag_kind='auto', kind='hist', diag_kws=dict(bins=50), plot_kws=dict(pmax=0.9))
+    plt.savefig(save_dir / 'labels_correlogram.jpg', dpi=200)
+    plt.close()
+    # matplotlib labels
+    matplotlib.use('svg')  # faster
+    try:
+        ax = plt.subplots(2, 2, figsize=(8, 8), tight_layout=True)[1].ravel()
+        y = ax[0].hist(c, bins=np.linspace(0, nc, nc + 1) - 0.5, rwidth=0.8)
+        with contextlib.suppress(Exception):  # color histogram bars by class
+            for i in range(nc):
+                y[2].patches[i].set_color([x / 255 for x in colors(i)])  # known issue #3195
+        ax[0].set_ylabel('instances')
+        if 0 < len(names) < 30:
+            # accept both the dict form (index -> name) and a plain sequence of names
+            class_names = list(names.values()) if isinstance(names, dict) else list(names)
+            ax[0].set_xticks(range(len(names)))
+            ax[0].set_xticklabels(class_names, rotation=90, fontsize=10)
+        else:
+            ax[0].set_xlabel('classes')
+        sn.histplot(x, x='x', y='y', ax=ax[2], bins=50, pmax=0.9)
+        sn.histplot(x, x='width', y='height', ax=ax[3], bins=50, pmax=0.9)
+        # rectangles, drawn from a copy so the caller's labels array keeps its original boxes
+        boxes = labels.copy()
+        boxes[:, 1:3] = 0.5  # center
+        boxes[:, 1:] = xywh2xyxy(boxes[:, 1:]) * 2000
+        img = Image.fromarray(np.ones((2000, 2000, 3), dtype=np.uint8) * 255)
+        for cls, *box in boxes[:1000]:
+            ImageDraw.Draw(img).rectangle(box, width=1, outline=colors(cls))  # plot
+        ax[1].imshow(img)
+        ax[1].axis('off')
+        for a in [0, 1, 2, 3]:
+            for s in ['top', 'right', 'left', 'bottom']:
+                ax[a].spines[s].set_visible(False)
+        plt.savefig(save_dir / 'labels.jpg', dpi=200)
+    finally:
+        # always restore the backend and release the figure, even when plotting fails
+        matplotlib.use('Agg')
+        plt.close()
+def imshow_cls(im, labels=None, pred=None, names=None, nmax=25, verbose=False, f=Path('images.jpg')):
+    """Save a grid of classification images titled with their labels (optional) and predictions (optional).
+    Args:
+        im: image batch; it is denormalized and split into one block per image
+        labels: optional true class indices, one per image
+        pred: optional predicted class indices, one per image
+        names: class-index to class-name lookup; defaults to 'class0'..'class999'
+        nmax (int): maximum number of images to plot
+        verbose (bool): also log the output path and the true/predicted class names
+        f (Path): output image path
+    Returns:
+        f, the path the grid was saved to
+    """
+    from utils.augmentations import denormalize
+    names = names or [f'class{i}' for i in range(1000)]
+    blocks = torch.chunk(denormalize(im.clone()).cpu().float(), len(im), dim=0)  # one block per batch image
+    n = min(len(blocks), nmax)  # number of plots
+    m = min(8, round(n ** 0.5))  # 8 x 8 default
+    # squeeze=False always yields a 2d axes array; without it a single-column grid with several rows
+    # (e.g. n=2 -> m=1) came back as an array that was then wrapped in a list and could not be drawn on
+    fig, ax = plt.subplots(math.ceil(n / m), m, squeeze=False)  # n/m rows x m cols
+    ax = ax.ravel()
+    for i in range(n):
+        ax[i].imshow(blocks[i].squeeze().permute((1, 2, 0)).numpy().clip(0.0, 1.0))
+        ax[i].axis('off')
+        if labels is not None:
+            s = names[labels[i]] + (f'—{names[pred[i]]}' if pred is not None else '')
+            ax[i].set_title(s, fontsize=8, verticalalignment='top')
+    plt.savefig(f, dpi=300, bbox_inches='tight')
+    plt.close()
+    if verbose:
+        LOGGER.info(f"Saving {f}")
+        if labels is not None:
+            LOGGER.info('True:     ' + ' '.join(f'{names[i]:3s}' for i in labels[:nmax]))
+        if pred is not None:
+            LOGGER.info('Predicted:' + ' '.join(f'{names[i]:3s}' for i in pred[:nmax]))
+    return f
+def plot_evolve(evolve_csv='path/to/evolve.csv'):  # from utils.plots import *; plot_evolve()
+    """Scatter every column from index 7 onward of evolve.csv against fitness and save the grid as a .png.
+    The value of each such column at the best-fitness row is marked on its plot and printed.
+    """
+    evolve_csv = Path(evolve_csv)
+    data = pd.read_csv(evolve_csv)
+    keys = [column.strip() for column in data.columns]
+    values = data.values
+    fit = fitness(values)
+    best = np.argmax(fit)  # max fitness index
+    plt.figure(figsize=(10, 12), tight_layout=True)
+    matplotlib.rc('font', **{'size': 8})
+    print(f'Best results from row {best} of {evolve_csv}:')
+    for i, k in enumerate(keys[7:]):
+        v = values[:, 7 + i]
+        mu = v[best]  # best single result
+        plt.subplot(6, 5, i + 1)
+        density = hist2d(v, fit, 20)
+        plt.scatter(v, fit, c=density, cmap='viridis', alpha=.8, edgecolors='none')
+        plt.plot(mu, fit.max(), 'k+', markersize=15)
+        plt.title(f'{k} = {mu:.3g}', fontdict={'size': 9})  # limit to 40 characters
+        if i % 5 != 0:
+            plt.yticks([])
+        print(f'{k:>15}: {mu:.3g}')
+    out_file = evolve_csv.with_suffix('.png')  # filename
+    plt.savefig(out_file, dpi=200)
+    plt.close()
+    print(f'Saved {out_file}')
+def plot_results(file='path/to/results.csv', dir=''):
+    """Plot ten columns of every results*.csv in the target directory and save them as results.png.
+    Usage: from utils.plots import *; plot_results('path/to/results.csv')
+    A file that fails to load or plot is logged and skipped.
+    """
+    save_dir = Path(file).parent if file else Path(dir)
+    fig, ax = plt.subplots(2, 5, figsize=(12, 6), tight_layout=True)
+    ax = ax.ravel()
+    files = list(save_dir.glob('results*.csv'))
+    assert len(files), f'No results.csv files found in {save_dir.resolve()}, nothing to plot.'
+    column_order = [1, 2, 3, 4, 5, 8, 9, 10, 6, 7]  # csv column shown in each subplot
+    for f in files:
+        try:
+            data = pd.read_csv(f)
+            titles = [name.strip() for name in data.columns]
+            x = data.values[:, 0]
+            for axis, j in zip(ax, column_order):
+                y = data.values[:, j].astype('float')
+                axis.plot(x, y, marker='.', label=f.stem, linewidth=2, markersize=8)
+                axis.set_title(titles[j], fontsize=12)
+        except Exception as e:
+            LOGGER.info(f'Warning: Plotting error for {f}: {e}')
+    ax[1].legend()
+    fig.savefig(save_dir / 'results.png', dpi=200)
+    plt.close()
+def profile_idetection(start=0, stop=0, labels=(), save_dir=''):
+    """Plot the iDetection 'frames*.txt' per-image logs in save_dir to idetection_profile.png.
+    Usage: from utils.plots import *; profile_idetection()
+    A file that fails to load or plot is reported with print() and skipped.
+    """
+    ax = plt.subplots(2, 4, figsize=(12, 6), tight_layout=True)[1].ravel()
+    s = ['Images', 'Free Storage (GB)', 'RAM Usage (GB)', 'Battery', 'dt_raw (ms)', 'dt_smooth (ms)', 'real-world FPS']
+    files = list(Path(save_dir).glob('frames*.txt'))
+    for fi, f in enumerate(files):
+        try:
+            results = np.loadtxt(f, ndmin=2).T[:, 90:-30]  # clip first and last rows
+            n = results.shape[1]  # number of rows
+            last = min(stop, n) if stop else n
+            x = np.arange(start, last)
+            results = results[:, x]
+            t = (results[0] - results[0].min())  # set t0=0s
+            results[0] = x
+            label = labels[fi] if len(labels) else f.stem.replace('frames_', '')
+            for i, a in enumerate(ax):
+                if i >= len(results):
+                    a.remove()
+                    continue
+                a.plot(t, results[i], marker='.', label=label, linewidth=1, markersize=5)
+                a.set_title(s[i])
+                a.set_xlabel('time (s)')
+                for side in ['top', 'right']:
+                    a.spines[side].set_visible(False)
+        except Exception as e:
+            print(f'Warning: Plotting error for {f}; {e}')
+    ax[1].legend()
+    plt.savefig(Path(save_dir) / 'idetection_profile.png', dpi=200)
+def save_one_box(xyxy, im, file=Path('im.jpg'), gain=1.02, pad=10, square=False, BGR=False, save=True):
+    """Crop one box out of a BGR image, optionally save the crop as a .jpg, and return it.
+    Args:
+        xyxy: box corners (x1, y1, x2, y2)
+        im: source image as an (h, w, c) array in BGR channel order
+        file: output path; the suffix is replaced by .jpg and the name is passed through increment_path
+        gain (float): multiplier applied to the box width and height
+        pad (int): pixels added to the box width and height after the gain
+        square (bool): expand the box to a square of its longer side first
+        BGR (bool): return the crop in BGR order when True, otherwise with the channels reversed (RGB)
+        save (bool): write the crop to disk
+    Returns:
+        the cropped image array
+    """
+    xyxy = torch.tensor(xyxy).view(-1, 4)
+    b = xyxy2xywh(xyxy)  # boxes
+    if square:
+        b[:, 2:] = b[:, 2:].max(1)[0].unsqueeze(1)  # attempt rectangle to square
+    b[:, 2:] = b[:, 2:] * gain + pad  # box wh * gain + pad
+    xyxy = xywh2xyxy(b).long()
+    clip_boxes(xyxy, im.shape)
+    crop_bgr = im[int(xyxy[0, 1]):int(xyxy[0, 3]), int(xyxy[0, 0]):int(xyxy[0, 2]), :]
+    if save:
+        file = Path(file)  # also accept a plain string path
+        file.parent.mkdir(parents=True, exist_ok=True)  # make directory
+        f = str(increment_path(file).with_suffix('.jpg'))
+        # cv2.imwrite(f, crop)  # save BGR, https://github.com/ultralytics/yolov5/issues/7007 chroma subsampling issue
+        # Always build the saved RGB image from the BGR crop: reversing the returned crop swapped red and
+        # blue in the file whenever BGR=False, because that crop was already RGB
+        Image.fromarray(crop_bgr[..., ::-1]).save(f, quality=95, subsampling=0)  # save RGB
+    return crop_bgr if BGR else crop_bgr[..., ::-1]
--- a/utils/segment/__init__.py
+++ b/utils/segment/__init__.py
+# init
\ No newline at end of file
--- a/utils/segment/augmentations.py
+++ b/utils/segment/augmentations.py
+import math
+import random
+import cv2
+import numpy as np
+from ..augmentations import box_candidates
+from ..general import resample_segments, segment2box
+def mixup(im, labels, segments, im2, labels2, segments2):
+    """Blend two images with a random Beta(32, 32) ratio and concatenate their labels and segments.
+    MixUp augmentation, https://arxiv.org/pdf/1710.09412.pdf
+    """
+    ratio = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
+    blended = im * ratio + im2 * (1 - ratio)
+    all_labels = np.concatenate((labels, labels2), 0)
+    all_segments = np.concatenate((segments, segments2), 0)
+    return blended.astype(np.uint8), all_labels, all_segments
+def random_perspective(im,
+                       targets=(),
+                       segments=(),
+                       degrees=10,
+                       translate=.1,
+                       scale=.1,
+                       shear=10,
+                       perspective=0.0,
+                       border=(0, 0)):
+    # Apply one random perspective/affine warp to an image and carry its targets and segments along.
+    # degrees, translate, scale, shear and perspective bound the uniform random draws made below;
+    # border is added twice to each image dimension to obtain the output size.
+    # Returns (im, targets, new_segments); new_segments stays an empty list when there are no targets.
+    # NOTE: the random draws happen in a fixed order (perspective, rotation, scale, shear, translation),
+    # so reordering these statements changes the result for a given random seed.
+    # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
+    # targets = [cls, xyxy]
+    height = im.shape[0] + border[0] * 2  # shape(h,w,c)
+    width = im.shape[1] + border[1] * 2
+    # Center
+    C = np.eye(3)
+    C[0, 2] = -im.shape[1] / 2  # x translation (pixels)
+    C[1, 2] = -im.shape[0] / 2  # y translation (pixels)
+    # Perspective
+    P = np.eye(3)
+    P[2, 0] = random.uniform(-perspective, perspective)  # x perspective (about y)
+    P[2, 1] = random.uniform(-perspective, perspective)  # y perspective (about x)
+    # Rotation and Scale
+    R = np.eye(3)
+    a = random.uniform(-degrees, degrees)
+    # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
+    s = random.uniform(1 - scale, 1 + scale)
+    # s = 2 ** random.uniform(-scale, scale)
+    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+    # Shear
+    S = np.eye(3)
+    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
+    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)
+    # Translation
+    T = np.eye(3)
+    T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width)  # x translation (pixels)
+    T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height)  # y translation (pixels)
+    # Combined rotation matrix
+    M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
+    # Warp only when there is a border or M differs from the identity
+    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+        if perspective:
+            im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114))
+        else:  # affine
+            im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114))
+    # Visualize
+    # import matplotlib.pyplot as plt
+    # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel()
+    # ax[0].imshow(im[:, :, ::-1])  # base
+    # ax[1].imshow(im2[:, :, ::-1])  # warped
+    # Transform label coordinates
+    n = len(targets)
+    new_segments = []
+    if n:
+        new = np.zeros((n, 4))  # one box per target, filled from its warped segment
+        segments = resample_segments(segments)  # upsample
+        for i, segment in enumerate(segments):
+            # homogeneous coordinates: append a column of ones so M can be applied with one matmul
+            xy = np.ones((len(segment), 3))
+            xy[:, :2] = segment
+            xy = xy @ M.T  # transform
+            xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2])  # perspective rescale or affine
+            # clip
+            new[i] = segment2box(xy, width, height)
+            new_segments.append(xy)
+        # filter candidates
+        # box_candidates returns the index/mask `i` used to keep targets, boxes and segments in step;
+        # the original boxes are multiplied by the sampled scale s before the comparison
+        i = box_candidates(box1=targets[:, 1:5].T * s, box2=new.T, area_thr=0.01)
+        targets = targets[i]
+        targets[:, 1:5] = new[i]
+        new_segments = np.array(new_segments)[i]
+    return im, targets, new_segments