ModelZoo / Magma_pytorch

Commit 0063a668, authored May 13, 2025 by chenzk
v1.0
Changes: 352
Showing 20 changed files with 765 additions and 0 deletions
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_robotap_first.yaml (+5, -0)
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_stacking_first.yaml (+7, -0)
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_stacking_strided.yaml (+7, -0)
facebookresearch/co-tracker/cotracker/evaluation/core/__init__.py (+5, -0)
facebookresearch/co-tracker/cotracker/evaluation/core/eval_utils.py (+138, -0)
facebookresearch/co-tracker/cotracker/evaluation/core/evaluator.py (+288, -0)
facebookresearch/co-tracker/cotracker/evaluation/evaluate.py (+190, -0)
facebookresearch/co-tracker/cotracker/models/__init__.py (+5, -0)
facebookresearch/co-tracker/cotracker/models/__pycache__/__init__.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/__pycache__/build_cotracker.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/bootstap_predictor.py (+65, -0)
facebookresearch/co-tracker/cotracker/models/build_cotracker.py (+45, -0)
facebookresearch/co-tracker/cotracker/models/core/__init__.py (+5, -0)
facebookresearch/co-tracker/cotracker/models/core/__pycache__/__init__.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/core/__pycache__/embeddings.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/core/__pycache__/model_utils.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/core/cotracker/__init__.py (+5, -0)
facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/__init__.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/blocks.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/cotracker.cpython-310.pyc (+0, -0)
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_robotap_first.yaml (new file, mode 100644)

defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker
dataset_name: tapvid_robotap_first
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_stacking_first.yaml (new file, mode 100644)

defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker
dataset_name: tapvid_stacking_first
facebookresearch/co-tracker/cotracker/evaluation/configs/eval_tapvid_stacking_strided.yaml (new file, mode 100644)

defaults:
  - default_config_eval

exp_dir: ./outputs/cotracker
dataset_name: tapvid_stacking_strided
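These three configs only override exp_dir and dataset_name on top of the default_config_eval node that evaluate.py (further down in this commit) registers in Hydra's ConfigStore. Assuming Hydra's standard --config-name flag, an evaluation run would be launched with something like `python ./cotracker/evaluation/evaluate.py --config-name eval_tapvid_robotap_first dataset_root=<path-to-tapvid-data>`, where the dataset path is a placeholder rather than part of this commit.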
facebookresearch/co-tracker/cotracker/evaluation/core/__init__.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
facebookresearch/co-tracker/cotracker/evaluation/core/eval_utils.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np

from typing import Iterable, Mapping, Tuple, Union


def compute_tapvid_metrics(
    query_points: np.ndarray,
    gt_occluded: np.ndarray,
    gt_tracks: np.ndarray,
    pred_occluded: np.ndarray,
    pred_tracks: np.ndarray,
    query_mode: str,
) -> Mapping[str, np.ndarray]:
    """Computes TAP-Vid metrics (Jaccard, Pts. Within Thresh, Occ. Acc.)

    See the TAP-Vid paper for details on the metric computation. All inputs are
    given in raster coordinates. The first three arguments should be the direct
    outputs of the reader: the 'query_points', 'occluded', and 'target_points'.
    The paper metrics assume these are scaled relative to 256x256 images.
    pred_occluded and pred_tracks are your algorithm's predictions.

    This function takes a batch of inputs, and computes metrics separately for
    each video. The metrics for the full benchmark are a simple mean of the
    metrics across the full set of videos. These numbers are between 0 and 1,
    but the paper multiplies them by 100 to ease reading.

    Args:
        query_points: The query points, in the format [t, y, x]. Its size is
            [b, n, 3], where b is the batch size and n is the number of queries.
        gt_occluded: A boolean array of shape [b, n, t], where t is the number
            of frames. True indicates that the point is occluded.
        gt_tracks: The target points, of shape [b, n, t, 2]. Each point is
            in the format [x, y].
        pred_occluded: A boolean array of predicted occlusions, in the same
            format as gt_occluded.
        pred_tracks: An array of track predictions from your algorithm, in the
            same format as gt_tracks.
        query_mode: Either 'first' or 'strided', depending on how queries are
            sampled. If 'first', we assume the prior knowledge that all points
            before the query point are occluded, and these are removed from the
            evaluation.

    Returns:
        A dict with the following keys:
            occlusion_accuracy: Accuracy at predicting occlusion.
            pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
                predicted to be within the given pixel threshold, ignoring
                occlusion prediction.
            jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
                threshold.
            average_pts_within_thresh: average across pts_within_{x}.
            average_jaccard: average across jaccard_{x}.
    """
    metrics = {}
    # Fixed bug is described in:
    # https://github.com/facebookresearch/co-tracker/issues/20
    eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
    if query_mode == "first":
        # evaluate frames after the query frame
        query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
    elif query_mode == "strided":
        # evaluate all frames except the query frame
        query_frame_to_eval_frames = 1 - eye
    else:
        raise ValueError("Unknown query mode " + query_mode)

    query_frame = query_points[..., 0]
    query_frame = np.round(query_frame).astype(np.int32)
    evaluation_points = query_frame_to_eval_frames[query_frame] > 0

    # Occlusion accuracy is simply how often the predicted occlusion equals the
    # ground truth.
    occ_acc = np.sum(
        np.equal(pred_occluded, gt_occluded) & evaluation_points,
        axis=(1, 2),
    ) / np.sum(evaluation_points)
    metrics["occlusion_accuracy"] = occ_acc

    # Next, convert the predictions and ground truth positions into pixel
    # coordinates.
    visible = np.logical_not(gt_occluded)
    pred_visible = np.logical_not(pred_occluded)
    all_frac_within = []
    all_jaccard = []
    for thresh in [1, 2, 4, 8, 16]:
        # True positives are points that are within the threshold and where both
        # the prediction and the ground truth are listed as visible.
        within_dist = np.sum(
            np.square(pred_tracks - gt_tracks),
            axis=-1,
        ) < np.square(thresh)
        is_correct = np.logical_and(within_dist, visible)

        # Compute the frac_within_threshold, which is the fraction of points
        # within the threshold among points that are visible in the ground truth,
        # ignoring whether they're predicted to be visible.
        count_correct = np.sum(
            is_correct & evaluation_points,
            axis=(1, 2),
        )
        count_visible_points = np.sum(visible & evaluation_points, axis=(1, 2))
        frac_correct = count_correct / count_visible_points
        metrics["pts_within_" + str(thresh)] = frac_correct
        all_frac_within.append(frac_correct)

        true_positives = np.sum(
            is_correct & pred_visible & evaluation_points, axis=(1, 2)
        )

        # The denominator of the jaccard metric is the true positives plus
        # false positives plus false negatives. However, note that true positives
        # plus false negatives is simply the number of points in the ground truth
        # which is easier to compute than trying to compute all three quantities.
        # Thus we just add the number of points in the ground truth to the number
        # of false positives.
        #
        # False positives are simply points that are predicted to be visible,
        # but the ground truth is not visible or too far from the prediction.
        gt_positives = np.sum(visible & evaluation_points, axis=(1, 2))
        false_positives = (~visible) & pred_visible
        false_positives = false_positives | ((~within_dist) & pred_visible)
        false_positives = np.sum(false_positives & evaluation_points, axis=(1, 2))
        jaccard = true_positives / (gt_positives + false_positives)
        metrics["jaccard_" + str(thresh)] = jaccard
        all_jaccard.append(jaccard)

    metrics["average_jaccard"] = np.mean(
        np.stack(all_jaccard, axis=1),
        axis=1,
    )
    metrics["average_pts_within_thresh"] = np.mean(
        np.stack(all_frac_within, axis=1),
        axis=1,
    )
    return metrics
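A quick way to sanity-check the input contract of compute_tapvid_metrics is to call it on random dummy data in the shapes the docstring describes (here b=1, n=4 queries, t=12 frames); all arrays and values below are made up purely for illustration:

import numpy as np

b, n, t = 1, 4, 12
rng = np.random.default_rng(0)
query_points = np.stack(
    [rng.integers(0, t, (b, n)), rng.uniform(0, 256, (b, n)), rng.uniform(0, 256, (b, n))],
    axis=-1,
)  # [b, n, 3] in [t, y, x] order
gt_occluded = rng.random((b, n, t)) < 0.1        # boolean [b, n, t]
pred_occluded = rng.random((b, n, t)) < 0.1
gt_tracks = rng.uniform(0, 256, (b, n, t, 2))    # [b, n, t, 2] in [x, y] order
pred_tracks = gt_tracks + rng.normal(0.0, 2.0, gt_tracks.shape)

out = compute_tapvid_metrics(
    query_points, gt_occluded, gt_tracks, pred_occluded, pred_tracks, query_mode="first"
)
print(out["occlusion_accuracy"], out["average_pts_within_thresh"], out["average_jaccard"])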
facebookresearch/co-tracker/cotracker/evaluation/core/evaluator.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from collections import defaultdict
import os
from typing import Optional

import torch
from tqdm import tqdm
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from cotracker.datasets.utils import dataclass_to_cuda_
from cotracker.utils.visualizer import Visualizer
from cotracker.models.core.model_utils import reduce_masked_mean
from cotracker.evaluation.core.eval_utils import compute_tapvid_metrics
from cotracker.predictor import CoTrackerOnlinePredictor
from cotracker.models.core.cotracker.cotracker3_offline import CoTrackerThreeOffline
from cotracker.models.core.cotracker.cotracker3_online import CoTrackerThreeOnline

import logging


class Evaluator:
    """
    A class defining the CoTracker evaluator.
    """

    def __init__(self, exp_dir) -> None:
        # Visualization
        self.exp_dir = exp_dir
        os.makedirs(exp_dir, exist_ok=True)
        self.visualization_filepaths = defaultdict(lambda: defaultdict(list))
        self.visualize_dir = os.path.join(exp_dir, "visualisations")

    def compute_metrics(self, metrics, sample, pred_trajectory, dataset_name):
        if isinstance(pred_trajectory, tuple):
            pred_trajectory, pred_visibility = pred_trajectory
        else:
            pred_visibility = None
        if "tapvid" in dataset_name:
            B, T, N, D = sample.trajectory.shape
            traj = sample.trajectory.clone()
            thr = 0.6

            if pred_visibility is None:
                logging.warning("visibility is NONE")
                pred_visibility = torch.zeros_like(sample.visibility)

            if not pred_visibility.dtype == torch.bool:
                pred_visibility = pred_visibility > thr

            query_points = sample.query_points.clone().cpu().numpy()

            pred_visibility = pred_visibility[:, :, :N]
            pred_trajectory = pred_trajectory[:, :, :N]

            gt_tracks = traj.permute(0, 2, 1, 3).cpu().numpy()
            gt_occluded = (
                torch.logical_not(sample.visibility.clone().permute(0, 2, 1))
                .cpu()
                .numpy()
            )

            pred_occluded = (
                torch.logical_not(pred_visibility.clone().permute(0, 2, 1))
                .cpu()
                .numpy()
            )
            pred_tracks = pred_trajectory.permute(0, 2, 1, 3).cpu().numpy()

            out_metrics = compute_tapvid_metrics(
                query_points,
                gt_occluded,
                gt_tracks,
                pred_occluded,
                pred_tracks,
                query_mode="strided" if "strided" in dataset_name else "first",
            )

            metrics[sample.seq_name[0]] = out_metrics
            for metric_name in out_metrics.keys():
                if "avg" not in metrics:
                    metrics["avg"] = {}
                metrics["avg"][metric_name] = np.mean(
                    [v[metric_name] for k, v in metrics.items() if k != "avg"]
                )

            logging.info(f"Metrics: {out_metrics}")
            logging.info(f"avg: {metrics['avg']}")
            print("metrics", out_metrics)
            print("avg", metrics["avg"])
        elif dataset_name == "dynamic_replica" or dataset_name == "pointodyssey":
            *_, N, _ = sample.trajectory.shape
            B, T, N = sample.visibility.shape
            H, W = sample.video.shape[-2:]
            device = sample.video.device

            out_metrics = {}

            d_vis_sum = d_occ_sum = d_sum_all = 0.0
            thrs = [1, 2, 4, 8, 16]
            sx_ = (W - 1) / 255.0
            sy_ = (H - 1) / 255.0
            sc_py = np.array([sx_, sy_]).reshape([1, 1, 2])
            sc_pt = torch.from_numpy(sc_py).float().to(device)
            __, first_visible_inds = torch.max(sample.visibility, dim=1)

            frame_ids_tensor = torch.arange(T, device=device)[None, :, None].repeat(
                B, 1, N
            )
            start_tracking_mask = frame_ids_tensor > (first_visible_inds.unsqueeze(1))

            for thr in thrs:
                d_ = (
                    torch.norm(
                        pred_trajectory[..., :2] / sc_pt
                        - sample.trajectory[..., :2] / sc_pt,
                        dim=-1,
                    )
                    < thr
                ).float()  # B,S-1,N
                d_occ = (
                    reduce_masked_mean(
                        d_, (1 - sample.visibility) * start_tracking_mask
                    ).item()
                    * 100.0
                )
                d_occ_sum += d_occ
                out_metrics[f"accuracy_occ_{thr}"] = d_occ

                d_vis = (
                    reduce_masked_mean(
                        d_, sample.visibility * start_tracking_mask
                    ).item()
                    * 100.0
                )
                d_vis_sum += d_vis
                out_metrics[f"accuracy_vis_{thr}"] = d_vis

                d_all = reduce_masked_mean(d_, start_tracking_mask).item() * 100.0
                d_sum_all += d_all
                out_metrics[f"accuracy_{thr}"] = d_all

            d_occ_avg = d_occ_sum / len(thrs)
            d_vis_avg = d_vis_sum / len(thrs)
            d_all_avg = d_sum_all / len(thrs)

            sur_thr = 50
            dists = torch.norm(
                pred_trajectory[..., :2] / sc_pt - sample.trajectory[..., :2] / sc_pt,
                dim=-1,
            )  # B,S,N
            dist_ok = 1 - (dists > sur_thr).float() * sample.visibility  # B,S,N
            survival = torch.cumprod(dist_ok, dim=1)  # B,S,N
            out_metrics["survival"] = torch.mean(survival).item() * 100.0

            out_metrics["accuracy_occ"] = d_occ_avg
            out_metrics["accuracy_vis"] = d_vis_avg
            out_metrics["accuracy"] = d_all_avg

            metrics[sample.seq_name[0]] = out_metrics
            for metric_name in out_metrics.keys():
                if "avg" not in metrics:
                    metrics["avg"] = {}
                metrics["avg"][metric_name] = float(
                    np.mean([v[metric_name] for k, v in metrics.items() if k != "avg"])
                )

            logging.info(f"Metrics: {out_metrics}")
            logging.info(f"avg: {metrics['avg']}")
            print("metrics", out_metrics)
            print("avg", metrics["avg"])

    @torch.no_grad()
    def evaluate_sequence(
        self,
        model,
        test_dataloader: torch.utils.data.DataLoader,
        dataset_name: str,
        train_mode=False,
        visualize_every: int = 50,
        writer: Optional[SummaryWriter] = None,
        step: Optional[int] = 0,
    ):
        metrics = {}
        vis = Visualizer(
            save_dir=self.exp_dir,
            fps=7,
        )
        for ind, sample in enumerate(tqdm(test_dataloader)):
            if isinstance(sample, tuple):
                sample, gotit = sample
                if not all(gotit):
                    print("batch is None")
                    continue
            if torch.cuda.is_available():
                dataclass_to_cuda_(sample)
                device = torch.device("cuda")
            else:
                device = torch.device("cpu")

            if (
                not train_mode
                and hasattr(model, "sequence_len")
                and (sample.visibility[:, : model.sequence_len].sum() == 0)
            ):
                print(f"skipping batch {ind}")
                continue

            if "tapvid" in dataset_name:
                queries = sample.query_points.clone().float()
                queries = torch.stack(
                    [
                        queries[:, :, 0],
                        queries[:, :, 2],
                        queries[:, :, 1],
                    ],
                    dim=2,
                ).to(device)
            else:
                queries = torch.cat(
                    [
                        torch.zeros_like(sample.trajectory[:, 0, :, :1]),
                        sample.trajectory[:, 0],
                    ],
                    dim=2,
                ).to(device)

            if isinstance(model.model, CoTrackerThreeOnline):
                online_model = CoTrackerOnlinePredictor(checkpoint=None)
                online_model.model = model.model
                online_model.step = model.model.window_len // 2
                online_model(
                    video_chunk=sample.video,
                    is_first_step=True,
                    queries=queries,
                    add_support_grid=False,
                )
                # Process the video
                for ind in range(
                    0,
                    sample.video.shape[1] - online_model.step,
                    online_model.step,
                ):
                    pred_tracks, pred_visibility = online_model(
                        video_chunk=sample.video[:, ind : ind + online_model.step * 2],
                        add_support_grid=False,
                        grid_size=0,
                    )  # B T N 2, B T N 1
                pred_tracks = (pred_tracks, pred_visibility)
            else:
                pred_tracks = model(sample.video, queries)
            if "strided" in dataset_name:
                inv_video = sample.video.flip(1).clone()
                inv_queries = queries.clone()
                inv_queries[:, :, 0] = inv_video.shape[1] - inv_queries[:, :, 0] - 1

                pred_trj, pred_vsb = pred_tracks
                inv_pred_trj, inv_pred_vsb = model(inv_video, inv_queries)

                inv_pred_trj = inv_pred_trj.flip(1)
                inv_pred_vsb = inv_pred_vsb.flip(1)

                mask = pred_trj == 0

                pred_trj[mask] = inv_pred_trj[mask]
                pred_vsb[mask[:, :, :, 0]] = inv_pred_vsb[mask[:, :, :, 0]]

                pred_tracks = pred_trj, pred_vsb

            if dataset_name == "badja" or dataset_name == "fastcapture":
                seq_name = sample.seq_name[0]
            else:
                seq_name = str(ind)
            if ind % visualize_every == 0:
                vis.visualize(
                    sample.video,
                    pred_tracks[0] if isinstance(pred_tracks, tuple) else pred_tracks,
                    filename=dataset_name + "_" + seq_name,
                    writer=writer,
                    step=step,
                )

            self.compute_metrics(metrics, sample, pred_tracks, dataset_name)
        return metrics
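For reference, the "avg" entry that compute_metrics maintains is simply the arithmetic mean of each metric over all per-sequence dicts seen so far, recomputed after every sequence. A minimal illustration with made-up numbers:

import numpy as np

metrics = {"seq_a": {"average_jaccard": 0.50}, "seq_b": {"average_jaccard": 0.75}}
metrics["avg"] = {
    name: float(np.mean([v[name] for k, v in metrics.items() if k != "avg"]))
    for name in metrics["seq_a"]
}
print(metrics["avg"])  # {'average_jaccard': 0.625}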
facebookresearch/co-tracker/cotracker/evaluation/evaluate.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import json
import os

import hydra
import numpy as np
import torch
from typing import Optional
from dataclasses import dataclass, field

from omegaconf import OmegaConf

from cotracker.datasets.utils import collate_fn
from cotracker.models.evaluation_predictor import EvaluationPredictor
from cotracker.evaluation.core.evaluator import Evaluator
from cotracker.models.build_cotracker import build_cotracker


@dataclass(eq=False)
class DefaultConfig:
    # Directory where all outputs of the experiment will be saved.
    exp_dir: str = "./outputs"

    # Name of the dataset to be used for the evaluation.
    dataset_name: str = "tapvid_davis_first"
    # The root directory of the dataset.
    dataset_root: str = "./"

    # Path to the pre-trained model checkpoint to be used for the evaluation.
    # The default value is the path to a specific CoTracker model checkpoint.
    checkpoint: str = "./checkpoints/scaled_online.pth"

    # EvaluationPredictor parameters
    # The size (N) of the support grid used in the predictor.
    # The total number of points is (N*N).
    grid_size: int = 5
    # The size (N) of the local support grid.
    local_grid_size: int = 8
    num_uniformly_sampled_pts: int = 0
    sift_size: int = 0
    # A flag indicating whether to evaluate one ground truth point at a time.
    single_point: bool = False
    offline_model: bool = False
    window_len: int = 16
    # The number of iterative updates for each sliding window.
    n_iters: int = 6

    seed: int = 0
    gpu_idx: int = 0
    local_extent: int = 50
    v2: bool = False

    # Override hydra's working directory to current working dir,
    # also disable storing the .hydra logs:
    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},
            "output_subdir": None,
        }
    )


def run_eval(cfg: DefaultConfig):
    """
    The function evaluates CoTracker on a specified benchmark dataset based on a provided configuration.

    Args:
        cfg (DefaultConfig): An instance of DefaultConfig class which includes:
          - exp_dir (str): The directory path for the experiment.
          - dataset_name (str): The name of the dataset to be used.
          - dataset_root (str): The root directory of the dataset.
          - checkpoint (str): The path to the CoTracker model's checkpoint.
          - single_point (bool): A flag indicating whether to evaluate one ground truth point at a time.
          - n_iters (int): The number of iterative updates for each sliding window.
          - seed (int): The seed for setting the random state for reproducibility.
          - gpu_idx (int): The index of the GPU to be used.
    """
    # Creating the experiment directory if it doesn't exist
    os.makedirs(cfg.exp_dir, exist_ok=True)

    # Saving the experiment configuration to a .yaml file in the experiment directory
    cfg_file = os.path.join(cfg.exp_dir, "expconfig.yaml")
    with open(cfg_file, "w") as f:
        OmegaConf.save(config=cfg, f=f)

    evaluator = Evaluator(cfg.exp_dir)

    cotracker_model = build_cotracker(
        cfg.checkpoint, offline=cfg.offline_model, window_len=cfg.window_len, v2=cfg.v2
    )

    # Creating the EvaluationPredictor object
    predictor = EvaluationPredictor(
        cotracker_model,
        grid_size=cfg.grid_size,
        local_grid_size=cfg.local_grid_size,
        sift_size=cfg.sift_size,
        single_point=cfg.single_point,
        num_uniformly_sampled_pts=cfg.num_uniformly_sampled_pts,
        n_iters=cfg.n_iters,
        local_extent=cfg.local_extent,
        interp_shape=(384, 512),
    )
    if torch.cuda.is_available():
        predictor.model = predictor.model.cuda()

    # Setting the random seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)

    # Constructing the specified dataset
    curr_collate_fn = collate_fn
    if "tapvid" in cfg.dataset_name:
        from cotracker.datasets.tap_vid_datasets import TapVidDataset

        dataset_type = cfg.dataset_name.split("_")[1]
        if dataset_type == "davis":
            data_root = os.path.join(
                cfg.dataset_root, "tapvid_davis", "tapvid_davis.pkl"
            )
        elif dataset_type == "kinetics":
            data_root = os.path.join(cfg.dataset_root, "tapvid_kinetics")
        elif dataset_type == "robotap":
            data_root = os.path.join(cfg.dataset_root, "tapvid_robotap")
        elif dataset_type == "stacking":
            data_root = os.path.join(
                cfg.dataset_root, "tapvid_rgb_stacking", "tapvid_rgb_stacking.pkl"
            )
        test_dataset = TapVidDataset(
            dataset_type=dataset_type,
            data_root=data_root,
            queried_first=not "strided" in cfg.dataset_name,
            # resize_to=None,
        )
    elif cfg.dataset_name == "dynamic_replica":
        from cotracker.datasets.dr_dataset import DynamicReplicaDataset

        test_dataset = DynamicReplicaDataset(
            cfg.dataset_root, sample_len=300, only_first_n_samples=1
        )

    # Creating the DataLoader object
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        collate_fn=curr_collate_fn,
    )

    # Timing and conducting the evaluation
    import time

    start = time.time()
    evaluate_result = evaluator.evaluate_sequence(
        predictor, test_dataloader, dataset_name=cfg.dataset_name
    )
    end = time.time()
    print(end - start)

    # Saving the evaluation results to a .json file
    evaluate_result = evaluate_result["avg"]
    print("evaluate_result", evaluate_result)
    result_file = os.path.join(cfg.exp_dir, f"result_eval_.json")
    evaluate_result["time"] = end - start
    print(f"Dumping eval results to {result_file}.")
    with open(result_file, "w") as f:
        json.dump(evaluate_result, f)


cs = hydra.core.config_store.ConfigStore.instance()
cs.store(name="default_config_eval", node=DefaultConfig)


@hydra.main(config_path="./configs/", config_name="default_config_eval")
def evaluate(cfg: DefaultConfig) -> None:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.gpu_idx)
    run_eval(cfg)


if __name__ == "__main__":
    evaluate()
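Outside of the Hydra entry point, run_eval can also be driven directly with a DefaultConfig instance. A minimal sketch; the checkpoint and dataset paths below are placeholders for files that must already exist locally:

cfg = DefaultConfig(
    exp_dir="./outputs/cotracker",           # expconfig.yaml and result_eval_.json are written here
    dataset_name="tapvid_davis_first",       # selects TapVidDataset with queried_first=True
    dataset_root="./datasets",               # expected to contain tapvid_davis/tapvid_davis.pkl
    checkpoint="./checkpoints/scaled_online.pth",
)
run_eval(cfg)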
facebookresearch/co-tracker/cotracker/models/__init__.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
facebookresearch/co-tracker/cotracker/models/__pycache__/__init__.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/__pycache__/build_cotracker.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/bootstap_predictor.py (new file, mode 100644)

import torch
import torch.nn.functional as F
import sys
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np

from tapnet.torch.tapir_model import TAPIR


def postprocess_occlusions(occlusions, expected_dist):
    visibles = (1 - F.sigmoid(occlusions)) * (1 - F.sigmoid(expected_dist)) > 0.5
    return visibles


class TAPIRPredictor(torch.nn.Module):
    def __init__(self, bootstap=False, model=None):
        super().__init__()
        self.interp_shape = (256, 256)
        if model is None:
            if bootstap:
                checkpoint = "./tapnet/bootstapir_checkpoint.pt"
                model = TAPIR(pyramid_level=1, extra_convs=True)
            else:
                checkpoint = "./tapnet/tapir_checkpoint_panning.pt"
                model = TAPIR(pyramid_level=0, extra_convs=False)
            model.load_state_dict(torch.load(checkpoint))

        self.model = model.eval().to("cuda")

    def forward(self, rgbs, queries=None, grid_size=0, iters=6, eval_depth=False):
        B, T, C, H, W = rgbs.shape
        rgbs_ = rgbs.reshape(B * T, C, H, W)
        rgbs_ = F.interpolate(rgbs_, tuple(self.interp_shape), mode="bilinear")
        rgbs_ = rgbs_.reshape(B, T, 3, self.interp_shape[0], self.interp_shape[1])
        rgbs_ = rgbs_[0].permute(0, 2, 3, 1)
        rgbs_ = (rgbs_ / 255.0) * 2 - 1

        if queries is not None:
            queries = queries.clone().float()
            B, N, D = queries.shape
            assert D == 3
            assert B == 1
            queries[:, :, 1] *= self.interp_shape[1] / W
            queries[:, :, 2] *= self.interp_shape[0] / H
            queries = torch.stack(
                [queries[..., 0], queries[..., 2], queries[..., 1]], dim=-1
            )

        outputs = self.model(video=rgbs_[None], query_points=queries)
        tracks, occlusions, expected_dist = (
            outputs["tracks"],
            outputs["occlusion"][0],
            outputs["expected_dist"][0],
        )
        visibility = postprocess_occlusions(occlusions, expected_dist)[None].permute(
            0, 2, 1
        )
        tracks = tracks.permute(0, 2, 1, 3)
        tracks[:, :, :, 0] *= W / float(self.interp_shape[1])
        tracks[:, :, :, 1] *= H / float(self.interp_shape[0])
        return tracks, visibility
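A hypothetical smoke test of this wrapper; it assumes the tapnet package is installed, the BootsTAPIR checkpoint exists at ./tapnet/bootstapir_checkpoint.pt, and a CUDA device is available (the class moves the model to "cuda" unconditionally):

predictor = TAPIRPredictor(bootstap=True)
video = torch.rand(1, 8, 3, 480, 640, device="cuda") * 255.0   # B T C H W, pixel values in [0, 255]
queries = torch.tensor([[[0.0, 320.0, 240.0],                   # one query per row as [t, x, y]; B must be 1
                         [2.0, 100.0, 50.0]]], device="cuda")
tracks, visibility = predictor(video, queries=queries)
print(tracks.shape, visibility.shape)   # expected: tracks [B, T, N, 2], visibility [B, T, N]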
facebookresearch/co-tracker/cotracker/models/build_cotracker.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch

from cotracker.models.core.cotracker.cotracker import CoTracker2
from cotracker.models.core.cotracker.cotracker3_offline import CoTrackerThreeOffline
from cotracker.models.core.cotracker.cotracker3_online import CoTrackerThreeOnline


def build_cotracker(
    checkpoint: str,
):
    if checkpoint is None:
        return build_cotracker()
    model_name = checkpoint.split("/")[-1].split(".")[0]
    if model_name == "cotracker":
        return build_cotracker(checkpoint=checkpoint)
    else:
        raise ValueError(f"Unknown model name {model_name}")


def build_cotracker(checkpoint=None, offline=True, window_len=16, v2=False):
    if v2:
        cotracker = CoTracker2(stride=4, window_len=window_len)
    else:
        if offline:
            cotracker = CoTrackerThreeOffline(
                stride=4, corr_radius=3, window_len=window_len
            )
        else:
            cotracker = CoTrackerThreeOnline(
                stride=4, corr_radius=3, window_len=window_len
            )

    if checkpoint is not None:
        with open(checkpoint, "rb") as f:
            state_dict = torch.load(f, map_location="cpu")
            if "model" in state_dict:
                state_dict = state_dict["model"]
        cotracker.load_state_dict(state_dict)
    return cotracker
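Note that the second build_cotracker definition shadows the first within this module, so the keyword-argument builder is the one callers actually get. A minimal usage sketch, assuming a CoTracker3 online checkpoint at the default path used by the evaluation config:

model = build_cotracker(
    checkpoint="./checkpoints/scaled_online.pth",  # assumed local path
    offline=False,                                 # builds CoTrackerThreeOnline
    window_len=16,
)
model = model.eval()
if torch.cuda.is_available():
    model = model.cuda()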
facebookresearch/co-tracker/cotracker/models/core/__init__.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
facebookresearch/co-tracker/cotracker/models/core/__pycache__/__init__.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/core/__pycache__/embeddings.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/core/__pycache__/model_utils.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/core/cotracker/__init__.py (new file, mode 100644)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/__init__.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/blocks.cpython-310.pyc (new file, mode 100644): binary file added

facebookresearch/co-tracker/cotracker/models/core/cotracker/__pycache__/cotracker.cpython-310.pyc (new file, mode 100644): binary file added