"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "965bcabfb906ca020df4b18fef520a5ce31245d1"
Unverified commit 8a45147f authored by YosuaMichael, committed by GitHub

Adding video accuracy for video_classification reference script (#6241)

* Add ensembled video accuracy to the video reference script

* Change the parser func to be similar to the classification reference

* Fix typo type->dtype

* Use custom kinetics

* Fix dataset so it does not get start_pts

* Change dataset name, and put video_idx at the end

* Apply ufmt formatting

* Use functional softmax, update meta, and use it to overwrite eval params

* Fix typo

* Put the eval parameters in the docs for now

* Change meta for video resnet to use frame rate 15, and adjust the wording in the docs
parent bd19fb8e
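For orientation before the diff: the change computes a video-level ("ensembled") accuracy by accumulating the softmax scores of every clip into its parent video and measuring top-k accuracy over videos instead of clips. Below is a minimal standalone sketch of that aggregation; the helper and its tensor names are illustrative, not part of the patch.

import torch


def video_level_accuracy(clip_logits, clip_video_ids, clip_targets, num_videos, num_classes, topk=(1, 5)):
    # Accumulate the softmax probability of every clip into its parent video.
    agg_preds = torch.zeros((num_videos, num_classes), dtype=torch.float32)
    agg_targets = torch.zeros(num_videos, dtype=torch.int64)
    preds = torch.softmax(clip_logits, dim=1)
    for b in range(clip_logits.size(0)):
        idx = clip_video_ids[b].item()
        agg_preds[idx] += preds[b]
        agg_targets[idx] = clip_targets[b]
    # Top-k accuracy over videos rather than over individual clips.
    accs = []
    for k in topk:
        topk_idx = agg_preds.topk(k, dim=1).indices
        correct = (topk_idx == agg_targets.unsqueeze(1)).any(dim=1)
        accs.append(correct.float().mean().item() * 100.0)
    return accs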
from typing import Tuple

import torchvision
from torch import Tensor


class KineticsWithVideoId(torchvision.datasets.Kinetics):
    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int, int]:
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label, video_idx
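This class lives in the new datasets helper module that the training script imports below (import datasets). A hedged usage sketch follows; the path and parameter values are placeholders, and the keyword arguments are the ones torchvision.datasets.Kinetics already accepts.

# Hypothetical usage; "/path/to/kinetics" is a placeholder.
dataset = KineticsWithVideoId(
    "/path/to/kinetics",
    frames_per_clip=16,
    num_classes="400",
    split="val",
    frame_rate=15,
    step_between_clips=1,
)
video, audio, label, video_idx = dataset[0]
# video_idx identifies the source video, so clips can later be grouped back
# together for video-level accuracy.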
@@ -3,6 +3,7 @@ import os
 import time
 import warnings

+import datasets
 import presets
 import torch
 import torch.utils.data
@@ -11,7 +12,7 @@
 import utils
 from torch import nn
 from torch.utils.data.dataloader import default_collate
-from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
+from torchvision.datasets.samplers import DistributedSampler, RandomClipSampler, UniformClipSampler


 def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None):
@@ -21,7 +22,7 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None):
     metric_logger.add_meter("clips/s", utils.SmoothedValue(window_size=10, fmt="{value:.3f}"))

     header = f"Epoch: [{epoch}]"
-    for video, target in metric_logger.log_every(data_loader, print_freq, header):
+    for video, target, _ in metric_logger.log_every(data_loader, print_freq, header):
         start_time = time.time()
         video, target = video.to(device), target.to(device)
         with torch.cuda.amp.autocast(enabled=scaler is not None):
@@ -52,13 +53,25 @@ def evaluate(model, criterion, data_loader, device):
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
     num_processed_samples = 0
+    # Group and aggregate output of a video
+    num_videos = len(data_loader.dataset.samples)
+    num_classes = len(data_loader.dataset.classes)
+    agg_preds = torch.zeros((num_videos, num_classes), dtype=torch.float32, device=device)
+    agg_targets = torch.zeros((num_videos), dtype=torch.int32, device=device)
     with torch.inference_mode():
-        for video, target in metric_logger.log_every(data_loader, 100, header):
+        for video, target, video_idx in metric_logger.log_every(data_loader, 100, header):
             video = video.to(device, non_blocking=True)
             target = target.to(device, non_blocking=True)
             output = model(video)
             loss = criterion(output, target)
+
+            # Use softmax to convert output into prediction probability
+            preds = torch.softmax(output, dim=1)
+            for b in range(video.size(0)):
+                idx = video_idx[b].item()
+                agg_preds[idx] += preds[b].detach()
+                agg_targets[idx] = target[b].detach().item()

             acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
             # FIXME need to take into account that the datasets
             # could have been padded in distributed setup
@@ -95,6 +108,11 @@ def evaluate(model, criterion, data_loader, device):
             top1=metric_logger.acc1, top5=metric_logger.acc5
         )
     )
+    # Reduce the agg_preds and agg_targets from all gpu and show result
+    agg_preds = utils.reduce_across_processes(agg_preds)
+    agg_targets = utils.reduce_across_processes(agg_targets, op=torch.distributed.ReduceOp.MAX)
+    agg_acc1, agg_acc5 = utils.accuracy(agg_preds, agg_targets, topk=(1, 5))
+    print(" * Video Acc@1 {acc1:.3f} Video Acc@5 {acc5:.3f}".format(acc1=agg_acc1, acc5=agg_acc5))

     return metric_logger.acc1.global_avg
@@ -110,7 +128,7 @@ def _get_cache_path(filepath, args):
 def collate_fn(batch):
     # remove audio from the batch
-    batch = [(d[0], d[2]) for d in batch]
+    batch = [(d[0], d[2], d[3]) for d in batch]
     return default_collate(batch)
@@ -146,7 +164,7 @@ def main(args):
     else:
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
-        dataset = torchvision.datasets.Kinetics(
+        dataset = datasets.KineticsWithVideoId(
             args.data_path,
             frames_per_clip=args.clip_len,
             num_classes=args.kinetics_version,
@@ -183,7 +201,7 @@ def main(args):
     else:
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
-        dataset_test = torchvision.datasets.Kinetics(
+        dataset_test = datasets.KineticsWithVideoId(
            args.data_path,
            frames_per_clip=args.clip_len,
            num_classes=args.kinetics_version,
@@ -313,10 +331,10 @@ def main(args):
     print(f"Training time {total_time_str}")


-def parse_args():
+def get_args_parser(add_help=True):
     import argparse

-    parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
+    parser = argparse.ArgumentParser(description="PyTorch Video Classification Training", add_help=add_help)

     parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
     parser.add_argument(
@@ -387,11 +405,9 @@ def parse_args():
     # Mixed precision training parameters
     parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")

-    args = parser.parse_args()
-    return args
+    return parser


 if __name__ == "__main__":
-    args = parse_args()
+    args = get_args_parser().parse_args()
     main(args)
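Replacing parse_args() with get_args_parser() mirrors the classification reference and also lets the parser be built programmatically. A small hedged sketch; it assumes the reference script's directory (references/video_classification) is on sys.path so that train is importable.

from train import get_args_parser  # assumes references/video_classification is importable

parser = get_args_parser(add_help=False)
args = parser.parse_args(["--data-path", "/path/to/kinetics"])
print(args.data_path)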
@@ -253,12 +253,12 @@ def init_distributed_mode(args):
     setup_for_distributed(args.rank == 0)


-def reduce_across_processes(val):
+def reduce_across_processes(val, op=dist.ReduceOp.SUM):
     if not is_dist_avail_and_initialized():
         # nothing to sync, but we still convert to tensor for consistency with the distributed case.
         return torch.tensor(val)

     t = torch.tensor(val, device="cuda")
     dist.barrier()
-    dist.all_reduce(t)
+    dist.all_reduce(t, op=op)
     return t
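Why the predictions are reduced with the default SUM while the targets use MAX: each rank only fills the rows of agg_preds and agg_targets for the videos whose clips it evaluated, so summing merges the probability rows, and MAX over the non-negative integer targets recovers the label written by whichever rank saw that video. A single-process illustration with made-up numbers (no torch.distributed required):

import torch

num_videos, num_classes = 3, 4
preds_rank0 = torch.zeros(num_videos, num_classes)
preds_rank1 = torch.zeros(num_videos, num_classes)
targets_rank0 = torch.zeros(num_videos, dtype=torch.int32)
targets_rank1 = torch.zeros(num_videos, dtype=torch.int32)

# Pretend rank 0 saw video 0 and rank 1 saw videos 1 and 2.
preds_rank0[0] = torch.tensor([0.7, 0.1, 0.1, 0.1])
targets_rank0[0] = 0
preds_rank1[1] = torch.tensor([0.2, 0.6, 0.1, 0.1])
targets_rank1[1] = 1
preds_rank1[2] = torch.tensor([0.1, 0.1, 0.1, 0.7])
targets_rank1[2] = 3

# What dist.all_reduce would produce across the two ranks:
agg_preds = preds_rank0 + preds_rank1                      # ReduceOp.SUM
agg_targets = torch.maximum(targets_rank0, targets_rank1)  # ReduceOp.MAX
print(agg_preds.argmax(dim=1))  # tensor([0, 1, 3])
print(agg_targets)              # tensor([0, 1, 3], dtype=torch.int32)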
@@ -445,12 +445,15 @@ class MViT_V1_B_Weights(WeightsEnum):
             "min_temporal_size": 16,
             "categories": _KINETICS400_CATEGORIES,
             "recipe": "https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md",
-            "_docs": """These weights support 16-frame clip inputs and were ported from the paper.""",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`"
+            ),
             "num_params": 36610672,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 78.47,
-                    "acc@5": 93.65,
+                    "acc@1": 78.477,
+                    "acc@5": 93.582,
                 }
             },
         },
...
@@ -312,7 +312,10 @@ _COMMON_META = {
     "min_size": (1, 1),
     "categories": _KINETICS400_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification",
-    "_docs": """These weights reproduce closely the accuracy of the paper for 16-frame clip inputs.""",
+    "_docs": (
+        "The weights reproduce closely the accuracy of the paper. The accuracies are estimated on video-level "
+        "with parameters `frame_rate=15`, `clips_per_video=5`, and `clip_len=16`."
+    ),
 }
@@ -325,8 +328,8 @@ class R3D_18_Weights(WeightsEnum):
             "num_params": 33371472,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 52.75,
-                    "acc@5": 75.45,
+                    "acc@1": 63.200,
+                    "acc@5": 83.479,
                 }
             },
         },
@@ -343,8 +346,8 @@ class MC3_18_Weights(WeightsEnum):
             "num_params": 11695440,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 53.90,
-                    "acc@5": 76.29,
+                    "acc@1": 63.960,
+                    "acc@5": 84.130,
                 }
             },
         },
@@ -361,8 +364,8 @@ class R2Plus1D_18_Weights(WeightsEnum):
             "num_params": 31505325,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 57.50,
-                    "acc@5": 78.81,
+                    "acc@1": 67.463,
+                    "acc@5": 86.175,
                 }
             },
         },
...
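The updated, video-level numbers can be read back from the weight enums. A small sketch; DEFAULT and the meta dictionary are existing fields of torchvision's weight enums, and the printed value reflects the metric shown in the diff above.

from torchvision.models.video import R3D_18_Weights

weights = R3D_18_Weights.DEFAULT  # KINETICS400_V1
print(weights.meta["_metrics"]["Kinetics-400"]["acc@1"])  # 63.2, estimated on video-level
print(weights.meta["_docs"])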