init

754fbc04 · bailuo · 7aa1ab82 · 754fbc04 · 754fbc04 · 754fbc04
Commit 754fbc04 authored Jul 16, 2024 by bailuo
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 1706 additions and 0 deletions

train.py train.py +106 -0

trainer.py trainer.py +1068 -0

util.py util.py +396 -0

viz.py viz.py +136 -0

No files found.
--- a/train.py
+++ b/train.py
+import os
+import subprocess
+import random
+import datetime
+import shutil
+import numpy as np
+import torch
+import torch.utils.data
+import torch.distributed as dist
+from config import config_parser
+from tensorboardX import SummaryWriter
+from loaders.create_training_dataset import get_training_dataset
+from trainer import BaseTrainer
+# Seed torch's global RNG with a fixed value as soon as this module is loaded.
+torch.manual_seed(1234)
+def synchronize():
+    """
+    Barrier across all processes for distributed training.
+    Does nothing when torch.distributed is unavailable, has not been
+    initialized, or the world size is 1.
+    """
+    distributed_ready = dist.is_available() and dist.is_initialized()
+    if distributed_ready and dist.get_world_size() != 1:
+        dist.barrier()
+def seed_worker(worker_id):
+    """
+    Re-seed NumPy and Python's `random` from torch's current initial seed
+    (reduced modulo 2**32). `worker_id` is accepted but not used.
+    """
+    seed = torch.initial_seed() % (1 << 32)
+    random.seed(seed)
+    np.random.seed(seed)
+def train(args):
+    """
+    Optimize on the sequence in ``args.data_dir`` and log through a SummaryWriter.
+    The output directory ``<save_dir>/<expname>_<seq_name>`` is created and
+    stored in ``args.out_dir``; the arguments (and the config file, if one was
+    given and has not been copied yet) are saved there. Training starts at
+    ``trainer.step + 1`` and stops once the step counter reaches
+    ``args.num_iters + start_step + 1``.
+    :param args: options object returned by ``config_parser``
+    """
+    seq_name = os.path.basename(args.data_dir.rstrip('/'))
+    out_dir = os.path.join(args.save_dir, '{}_{}'.format(args.expname, seq_name))
+    os.makedirs(out_dir, exist_ok=True)
+    print('optimizing for {}...\n output is saved in {}'.format(seq_name, out_dir))
+    args.out_dir = out_dir
+    # save the args and config files
+    with open(os.path.join(out_dir, 'args.txt'), 'w') as file:
+        for arg in sorted(vars(args)):
+            if not arg.startswith('_'):
+                file.write('{} = {}\n'.format(arg, getattr(args, arg)))
+    if args.config:
+        config_copy = os.path.join(out_dir, 'config.txt')
+        # keep the first copy so a resumed run does not overwrite it
+        if not os.path.isfile(config_copy):
+            shutil.copy(args.config, config_copy)
+    log_dir = 'logs/{}_{}'.format(args.expname, seq_name)
+    writer = SummaryWriter(log_dir)
+    try:
+        g = torch.Generator()
+        g.manual_seed(args.loader_seed)
+        dataset, data_sampler = get_training_dataset(args, max_interval=args.start_interval)
+        data_loader = torch.utils.data.DataLoader(dataset,
+                                                  batch_size=args.num_pairs,
+                                                  worker_init_fn=seed_worker,
+                                                  generator=g,
+                                                  num_workers=args.num_workers,
+                                                  sampler=data_sampler,
+                                                  # DataLoader forbids shuffle together with a sampler
+                                                  shuffle=data_sampler is None,
+                                                  pin_memory=True)
+        # get trainer
+        trainer = BaseTrainer(args)
+        start_step = trainer.step + 1
+        stop_step = args.num_iters + start_step + 1
+        step = start_step
+        epoch = 0
+        while step < stop_step:
+            for batch in data_loader:
+                trainer.train_one_step(step, batch)
+                trainer.log(writer, step)
+                step += 1
+                # widen the maximum frame interval by one every 2000 steps
+                dataset.set_max_interval(args.start_interval + step // 2000)
+                if step >= stop_step:
+                    break
+            epoch += 1
+            if args.distributed:
+                data_sampler.set_epoch(epoch)
+    finally:
+        # flush and release the event file even if training raises
+        writer.close()
+if __name__ == '__main__':
+    # Entry point: parse options, set up the NCCL process group when running
+    # distributed, then start training.
+    args = config_parser()
+    if args.distributed:
+        torch.cuda.set_device(args.local_rank)
+        dist.init_process_group(init_method="env://", backend="nccl")
+        synchronize()
+    train(args)
--- a/trainer.py
+++ b/trainer.py
--- a/util.py
+++ b/util.py
+import numpy as np
+import os, sys, time
+import imageio
+import cv2
+import shutil
+from datetime import datetime
+import matplotlib.pyplot as plt
+import torch
+import torch.nn.functional as F
+import socket
+import contextlib
+from matplotlib import cm
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from matplotlib.figure import Figure
+import matplotlib as mpl
+import subprocess
+TINY_NUMBER = 1e-6      # float32 only has 7 decimal digits precision
+torch.manual_seed(1234)
+np.random.seed(0)
+def sigma2alpha(sigma):
+    """Return 1 - exp(-sigma), evaluated with torch."""
+    return 1. - torch.exp(-sigma)
+def float2uint8(x):
+    """
+    Convert a float array to uint8 by scaling with 255 (fraction truncated).
+    The scaled values are clamped to [0, 255] before the cast, so inputs
+    slightly outside [0, 1] saturate instead of wrapping around.
+    :param x: numpy array, nominally in [0, 1]
+    :return: uint8 array of the same shape
+    """
+    return np.clip(255. * x, 0., 255.).astype(np.uint8)
+def uint82float(img):
+    """Return `img` as a C-contiguous array divided by 255."""
+    contiguous = np.ascontiguousarray(img)
+    return contiguous / 255.
+def skew(x):
+    """
+    Build the 3x3 skew-symmetric matrix [[0, -x2, x1], [x2, 0, -x0], [-x1, x0, 0]].
+    :param x: 3 elements; a torch tensor, a numpy array or a plain sequence
+    :return: torch tensor on x's device for tensor input, numpy array otherwise
+    """
+    rows = [[0, -x[2], x[1]],
+            [x[2], 0, -x[0]],
+            [-x[1], x[0], 0]]
+    # isinstance instead of inspecting str(x.dtype): also works for inputs
+    # that have no dtype attribute, e.g. lists and tuples
+    if isinstance(x, torch.Tensor):
+        return torch.tensor(rows, device=x.device)
+    return np.array(rows)
+def img2mse(x, y, mask=None):
+    '''
+    Mean squared error between two images, optionally weighted by a mask.
+    :param x: img 1, [(...), 3]
+    :param y: img 2, [(...), 3]
+    :param mask: optional, [(...)]
+    :return: mse score
+    '''
+    sq_err = (x - y) * (x - y)
+    if mask is None:
+        return torch.mean(sq_err)
+    weighted_sum = torch.sum(sq_err * mask.unsqueeze(-1))
+    # TINY_NUMBER keeps the division finite when the mask sums to zero
+    normalizer = torch.sum(mask) * x.shape[-1] + TINY_NUMBER
+    return weighted_sum / normalizer
+def homogenize(coord):
+    """Append a trailing 1 along the last dimension of `coord`."""
+    ones = torch.ones_like(coord[..., :1])
+    return torch.cat([coord, ones], dim=-1)
+def normalize_coords(coords, h, w, no_shift=False):
+    """
+    Scale (x, y) coordinates by 2 / (w - 1, h - 1); unless `no_shift` is set,
+    also subtract 1 so that [0, w-1] x [0, h-1] maps to [-1, 1].
+    """
+    assert coords.shape[-1] == 2
+    extent = torch.tensor([w-1., h-1.], device=coords.device)
+    scaled = coords / extent * 2
+    if no_shift:
+        return scaled
+    return scaled - 1.
+def denormalize_coords(coords, h, w, no_shift=False):
+    """
+    Inverse of normalize_coords: multiply by (w - 1, h - 1) / 2, first adding 1
+    to the coordinates unless `no_shift` is set.
+    """
+    assert coords.shape[-1] == 2
+    extent = torch.tensor([w-1., h-1.], device=coords.device)
+    base = coords if no_shift else coords + 1.
+    return base * extent / 2.
+def gen_grid(h, w, device, normalize=False, homogeneous=False):
+    """
+    Build a grid of (x, y) coordinates as a torch tensor.
+    :param h: number of rows
+    :param w: number of columns
+    :param device: device the grid is created on
+    :param normalize: coordinates span [-1, 1] instead of 0..w-1 / 0..h-1
+    :param homogeneous: append a third channel of ones
+    :return: grid, [h, w, 2 or 3]
+    """
+    if normalize:
+        ys = torch.linspace(-1., 1., steps=h, device=device)
+        xs = torch.linspace(-1., 1., steps=w, device=device)
+    else:
+        ys = torch.arange(0, h, device=device)
+        xs = torch.arange(0, w, device=device)
+    yy, xx = torch.meshgrid(ys, xs)
+    grid = torch.stack([xx, yy], dim=-1)
+    if not homogeneous:
+        return grid
+    return torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1)
+def gen_grid_np(h, w, normalize=False, homogeneous=False):
+    """
+    NumPy counterpart of gen_grid: a grid of (x, y) coordinates.
+    :param h: number of rows
+    :param w: number of columns
+    :param normalize: coordinates span [-1, 1] instead of 0..w-1 / 0..h-1
+    :param homogeneous: append a third channel of ones
+    :return: grid, [h, w, 2 or 3]
+    """
+    if normalize:
+        ys = np.linspace(-1., 1., num=h)
+        xs = np.linspace(-1., 1., num=w)
+    else:
+        ys = np.arange(0, h)
+        xs = np.arange(0, w)
+    # 'ij' indexing with (ys, xs) gives the same [h, w] arrays as the default
+    # 'xy' indexing with (xs, ys)
+    yy, xx = np.meshgrid(ys, xs, indexing='ij')
+    grid = np.stack([xx, yy], axis=-1)
+    if not homogeneous:
+        return grid
+    return np.concatenate([grid, np.ones_like(grid[..., :1])], axis=-1)
+def save_current_code(outdir):
+    """
+    Copy the current working directory into `outdir`/code/<month_day-H:M:S>,
+    skipping entries that match the ignore patterns listed below.
+    """
+    stamp = datetime.now().strftime("%m_%d-%H:%M:%S")
+    skip = shutil.ignore_patterns(
+        'data*', 'OLD*',
+        'logs*', 'out*', 'runs*', '*.png', '*.mp4', '*__pycache__*',
+        '*.git*', '*.idea*', '*.zip', '*.jpg')
+    shutil.copytree('.', os.path.join(outdir, 'code', stamp), ignore=skip)
+def drawMatches(img1, img2, kp1, kp2, num_vis=200, idx_vis=None, radius=2, mask=None):
+    """
+    Draw corresponding keypoints on two images and return them side by side.
+    Each match gets one color, chosen from the angle of its img1 keypoint
+    around the median of `kp1`. Points outside an image are not drawn.
+    :param img1: first image (converted with float2uint8)
+    :param img2: second image (converted with float2uint8)
+    :param kp1: keypoints in img1, [N, 2] as (x, y)
+    :param kp2: keypoints in img2, [N, 2] as (x, y)
+    :param num_vis: number of matches sampled at random when idx_vis is None
+    :param idx_vis: optional explicit indices of the matches to draw
+    :param radius: circle radius in pixels; also scales the cross markers
+    :param mask: optional per-drawn-match flags; flagged matches are drawn
+                 first and shown as crosses in img2
+    :return: uint8 image, img1 and img2 concatenated along the width
+    """
+    num_pts = len(kp1)
+    if idx_vis is None:
+        if num_vis < num_pts:
+            idx_vis = np.random.choice(num_pts, num_vis, replace=False)
+        else:
+            idx_vis = np.arange(num_pts)
+    kp1_vis = kp1[idx_vis]
+    kp2_vis = kp2[idx_vis]
+    h1, w1 = img1.shape[:2]
+    h2, w2 = img2.shape[:2]
+    img1 = float2uint8(img1)
+    img2 = float2uint8(img2)
+    center = np.median(kp1, axis=0)
+    num_colors = 128
+    # palette[k]: sample k of the hsv colormap, scaled to 0-255, channels reversed
+    palette = [(255 * np.array(plt.cm.hsv(k / float(num_colors)))[:3][::-1]).astype(np.int32)
+               for k in range(num_colors)]
+    # cv2 line drawing requires thickness >= 1; int(radius / 2) is 0 for radius 1
+    marker_thickness = max(1, int(radius / 2))
+    if mask is not None:
+        ind = np.argsort(mask)[::-1]
+        kp1_vis = kp1_vis[ind]
+        kp2_vis = kp2_vis[ind]
+        mask = mask[ind]
+    for i, (pt1, pt2) in enumerate(zip(kp1_vis, kp2_vis)):
+        coord_angle = np.arctan2(pt1[1] - center[1], pt1[0] - center[0])
+        corr_color = int(np.int32(64 * coord_angle / np.pi) % num_colors)
+        color = tuple(palette[corr_color].tolist())
+        if (pt1[0] <= w1 - 1) and (pt1[0] >= 0) and (pt1[1] <= h1 - 1) and (pt1[1] >= 0):
+            img1 = cv2.circle(img1, (int(pt1[0]), int(pt1[1])), radius, color, -1, cv2.LINE_AA)
+        if (pt2[0] <= w2 - 1) and (pt2[0] >= 0) and (pt2[1] <= h2 - 1) and (pt2[1] >= 0):
+            if mask is not None and mask[i]:
+                img2 = cv2.drawMarker(img2, (int(pt2[0]), int(pt2[1])), color, markerType=cv2.MARKER_CROSS,
+                                      markerSize=int(5*radius), thickness=marker_thickness,
+                                      line_type=cv2.LINE_AA)
+            else:
+                img2 = cv2.circle(img2, (int(pt2[0]), int(pt2[1])), radius, color, -1, cv2.LINE_AA)
+    out = np.concatenate([img1, img2], axis=1)
+    return out
+def get_vertical_colorbar(h, vmin, vmax, cmap_name='jet', label=None, cbar_precision=2):
+    '''
+    Render a vertical color bar with matplotlib and return it as an image.
+    :param h: output height in pixels; the width follows the rendered aspect ratio
+    :param vmin: min value
+    :param vmax: max value
+    :param cmap_name: name of the matplotlib colormap
+    :param label: optional label for the color bar
+    :param cbar_precision: decimals kept in the tick labels (0 drops the ".0")
+    :return: float32 RGB image in [0, 1], [h, w, 3]
+    '''
+    fig = Figure(figsize=(2, 8), dpi=100)
+    fig.subplots_adjust(right=1.5)
+    canvas = FigureCanvasAgg(fig)
+    # Do some plotting.
+    ax = fig.add_subplot(111)
+    # plt.get_cmap rather than cm.get_cmap: the latter was removed in Matplotlib 3.9
+    cmap = plt.get_cmap(cmap_name)
+    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
+    tick_cnt = 6
+    tick_loc = np.linspace(vmin, vmax, tick_cnt)
+    cb1 = mpl.colorbar.ColorbarBase(ax, cmap=cmap,
+                                    norm=norm,
+                                    ticks=tick_loc,
+                                    orientation='vertical')
+    tick_label = [str(np.round(x, cbar_precision)) for x in tick_loc]
+    if cbar_precision == 0:
+        # strip the trailing ".0" left by rounding to zero decimals
+        tick_label = [x[:-2] for x in tick_label]
+    cb1.set_ticklabels(tick_label)
+    cb1.ax.tick_params(labelsize=18, rotation=0)
+    if label is not None:
+        cb1.set_label(label)
+    fig.tight_layout()
+    canvas.draw()
+    s, (width, height) = canvas.print_to_buffer()
+    # the buffer is RGBA; keep RGB and rescale to [0, 1]
+    im = np.frombuffer(s, np.uint8).reshape((height, width, 4))
+    im = im[:, :, :3].astype(np.float32) / 255.
+    if h != im.shape[0]:
+        w = int(im.shape[1] / im.shape[0] * h)
+        im = cv2.resize(im, (w, h), interpolation=cv2.INTER_AREA)
+    return im
+def colorize_np(x, cmap_name='jet', mask=None, range=None, append_cbar=False, cbar_in_image=False, cbar_precision=2):
+    '''
+    turn a grayscale image into a color image
+    :param x: input grayscale, [H, W]; the caller's array is not modified
+    :param cmap_name: the colorization method
+    :param mask: the mask image, [H, W]; pixels outside the mask are painted white
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
+    :param cbar_precision: decimals shown on the color bar ticks
+    :return: colorized image, [H, W, 3] (wider when the color bar is appended beside it)
+    '''
+    if range is not None:
+        vmin, vmax = range
+    elif mask is not None:
+        masked = x[mask]
+        # smallest non-zero masked value, largest masked value
+        vmin = np.min(masked[np.nonzero(masked)])
+        vmax = np.max(masked)
+        # copy before writing so the caller's array is left untouched
+        x = x.copy()
+        x[np.logical_not(mask)] = vmin
+    else:
+        vmin, vmax = np.percentile(x, (1, 100))
+        vmax += TINY_NUMBER
+    x = np.clip(x, vmin, vmax)
+    x = (x - vmin) / (vmax - vmin)
+    # plt.get_cmap rather than cm.get_cmap: the latter was removed in Matplotlib 3.9
+    cmap = plt.get_cmap(cmap_name)
+    x_new = cmap(x)[:, :, :3]
+    if mask is not None:
+        mask = np.float32(mask[:, :, np.newaxis])
+        x_new = x_new * mask + np.ones_like(x_new) * (1. - mask)
+    if not append_cbar:
+        return x_new
+    # render the color bar only when it is actually used
+    cbar = get_vertical_colorbar(h=x.shape[0], vmin=vmin, vmax=vmax, cmap_name=cmap_name, cbar_precision=cbar_precision)
+    if cbar_in_image:
+        x_new[:, -cbar.shape[1]:, :] = cbar
+    else:
+        x_new = np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
+    return x_new
+# tensor
+def colorize(x, cmap_name='jet', mask=None, range=None, append_cbar=False, cbar_in_image=False):
+    """
+    Tensor wrapper around colorize_np.
+    :param x: grayscale tensor, converted to numpy on the CPU
+    :param cmap_name: the colorization method
+    :param mask: optional mask tensor; values above 0.99 count as inside
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image
+    :return: colorized image as a tensor on x's original device
+    """
+    device = x.device
+    # detach first: .numpy() raises on tensors that require grad
+    x = x.detach().cpu().numpy()
+    if mask is not None:
+        mask = mask.detach().cpu().numpy() > 0.99
+    x = colorize_np(x, cmap_name, mask, range, append_cbar, cbar_in_image)
+    x = torch.from_numpy(x).to(device)
+    return x
+def make_colorwheel():
+    """
+    Generates a color wheel for optical flow visualization as presented in:
+        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
+        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
+    Code follows the original C++ source code of Daniel Scharstein.
+    Code follows the Matlab source code of Deqing Sun.
+    Returns:
+        np.ndarray: Color wheel, shape [55, 3], values in 0..255
+    """
+    # (segment length, channel held at 255, channel that ramps, ramp rises?)
+    segments = (
+        (15, 0, 1, True),    # RY
+        (6, 1, 0, False),    # YG
+        (4, 1, 2, True),     # GC
+        (11, 2, 1, False),   # CB
+        (13, 2, 0, True),    # BM
+        (6, 0, 2, False),    # MR
+    )
+    ncols = sum(length for length, _, _, _ in segments)
+    colorwheel = np.zeros((ncols, 3))
+    start = 0
+    for length, full_ch, ramp_ch, rising in segments:
+        ramp = np.floor(255 * np.arange(0, length) / length)
+        rows = slice(start, start + length)
+        colorwheel[rows, full_ch] = 255
+        colorwheel[rows, ramp_ch] = ramp if rising else 255 - ramp
+        start += length
+    return colorwheel
+def flow_uv_to_colors(u, v, convert_to_bgr=False):
+    """
+    Applies the flow color wheel to (possibly clipped) flow components u and v.
+    According to the C++ source code of Daniel Scharstein
+    According to the Matlab source code of Deqing Sun
+    Args:
+        u (np.ndarray): Input horizontal flow of shape [H,W]
+        v (np.ndarray): Input vertical flow of shape [H,W]
+        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+    Returns:
+        np.ndarray: Flow visualization image of shape [H,W,3]
+    """
+    height, width = u.shape[0], u.shape[1]
+    flow_image = np.zeros((height, width, 3), np.uint8)
+    colorwheel = make_colorwheel()  # shape [55x3]
+    ncols = colorwheel.shape[0]
+    rad = np.sqrt(np.square(u) + np.square(v))
+    angle = np.arctan2(-v, -u) / np.pi
+    # fractional position on the wheel and its two neighbouring entries
+    fk = (angle + 1) / 2 * (ncols - 1)
+    k0 = np.floor(fk).astype(np.int32)
+    k1 = np.where(k0 + 1 == ncols, 0, k0 + 1)  # wrap around the wheel
+    f = fk - k0
+    in_range = rad <= 1
+    for ch, wheel_channel in enumerate(colorwheel.T):
+        col0 = wheel_channel[k0] / 255.0
+        col1 = wheel_channel[k1] / 255.0
+        col = (1 - f) * col0 + f * col1
+        # fade towards white with shrinking radius; dim values that are out of range
+        col = np.where(in_range, 1 - rad * (1 - col), col * 0.75)
+        # Note the 2-ch => BGR instead of RGB
+        ch_idx = 2 - ch if convert_to_bgr else ch
+        flow_image[:, :, ch_idx] = np.floor(255 * col)
+    return flow_image
+def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
+    """
+    Convert a flow field (or a batch of flow fields) to a color image.
+    The flow is divided by its largest magnitude before being colorized.
+    Args:
+        flow_uv (np.ndarray): Flow UV image of shape [H,W,2] or [N,H,W,2]
+        clip_flow (float, optional): Clip each flow component to
+            [-clip_flow, clip_flow]. Defaults to None (no clipping).
+        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+    Returns:
+        np.ndarray: Flow visualization image of shape [H,W,3] or [N,H,W,3]
+    """
+    assert flow_uv.ndim == 3 or flow_uv.ndim == 4, 'input flow must have three or four dimensions'
+    assert flow_uv.shape[-1] == 2, 'input flow must have shape [..., H,W,2]'
+    if clip_flow is not None:
+        # flow components are signed: clip symmetrically rather than to
+        # [0, clip_flow], which would zero out all negative motion
+        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)
+    u = flow_uv[..., 0]
+    v = flow_uv[..., 1]
+    rad = np.sqrt(np.square(u) + np.square(v))
+    rad_max = np.max(rad)
+    epsilon = 1e-5  # avoids division by zero for an all-zero flow
+    u = u / (rad_max + epsilon)
+    v = v / (rad_max + epsilon)
+    if flow_uv.ndim == 4:
+        return np.stack([flow_uv_to_colors(u_, v_, convert_to_bgr) for (u_, v_) in zip(u, v)], axis=0)
+    else:
+        return flow_uv_to_colors(u, v, convert_to_bgr)
--- a/viz.py
+++ b/viz.py
+import os
+import imageio
+import glob
+import torch
+import numpy as np
+import util
+import subprocess
+from config import config_parser
+from trainer import BaseTrainer
+import colorsys
+from matplotlib import cm
+import cv2
+# Module-level "jet" colormap; vis_trail samples it once per tracked point.
+color_map = cm.get_cmap("jet")
+def vis_trail(scene_dir, kpts_foreground, kpts_background, save_path):
+    """
+    This function calculates the median motion of the background, which is subsequently
+    subtracted from the foreground motion. This subtraction process "stabilizes" the camera and
+    improves the interpretability of the foreground motion trails.
+    :param scene_dir: directory with a "color" sub-folder holding the frames
+    :param kpts_foreground: foreground tracks, indexed [frame, point, xy]
+    :param kpts_background: background tracks, indexed [frame, point, xy]
+    :param save_path: path of the video written with imageio.mimwrite
+    """
+    img_dir = os.path.join(scene_dir, "color")
+    img_files = sorted(glob.glob(os.path.join(img_dir, "*")))
+    images = np.array([imageio.imread(img_file) for img_file in img_files])
+    kpts_foreground = kpts_foreground[:, ::1]  # can adjust kpts sampling rate here
+    num_imgs, num_pts = kpts_foreground.shape[:2]
+    # Colors depend only on the point index, so build them once instead of
+    # once per (frame, trail segment, point).
+    point_colors = [np.array(color_map(j / max(1, float(num_pts - 1)))[:3]) * 255
+                    for j in range(num_pts)]
+    # trail segments use the same color after an rgb -> hsv -> rgb round trip
+    line_colors = [colorsys.hsv_to_rgb(*colorsys.rgb_to_hsv(c[0], c[1], c[2]))
+                   for c in point_colors]
+    frames = []
+    for i in range(num_imgs):
+        # remove the median background displacement relative to frame i
+        kpts = kpts_foreground - np.median(kpts_background - kpts_background[i], axis=1, keepdims=True)
+        img_curr = images[i]
+        for t in range(i):
+            img1 = img_curr.copy()
+            # changing opacity
+            alpha = max(1 - 0.9 * ((i - t) / ((i + 1) * .99)), 0.1)
+            for j in range(num_pts):
+                pt1 = kpts[t, j]
+                pt2 = kpts[t+1, j]
+                p1 = (int(round(pt1[0])), int(round(pt1[1])))
+                p2 = (int(round(pt2[0])), int(round(pt2[1])))
+                cv2.line(img1, p1, p2, line_colors[j], thickness=1, lineType=16)
+            img_curr = cv2.addWeighted(img1, alpha, img_curr, 1 - alpha, 0)
+        for j in range(num_pts):
+            pt1 = kpts[i, j]
+            p1 = (int(round(pt1[0])), int(round(pt1[1])))
+            cv2.circle(img_curr, p1, 2, point_colors[j], -1, lineType=16)
+        frames.append(img_curr)
+    imageio.mimwrite(save_path, frames, quality=8, fps=10)
+if __name__ == '__main__':
+    # Render correspondence videos (and motion trails when a mask is available)
+    # for the configured query frame.
+    args = config_parser()
+    seq_name = os.path.basename(args.data_dir.rstrip('/'))
+    trainer = BaseTrainer(args)
+    num_imgs = trainer.num_imgs
+    vis_dir = os.path.join(args.save_dir, '{}_{}'.format(args.expname, seq_name), 'vis')
+    print('output will be saved in {}'.format(vis_dir))
+    os.makedirs(vis_dir, exist_ok=True)
+    query_id = args.query_frame_id
+    radius = 3  # the point radius for point correspondence visualization
+    # options shared by every eval_video_correspondences call below
+    eval_opts = dict(vis_occlusion=args.vis_occlusion,
+                     occlusion_th=args.occlusion_th,
+                     use_max_loc=args.use_max_loc,
+                     radius=radius)
+    mask = None
+    if os.path.exists(args.foreground_mask_path):
+        h, w = trainer.h, trainer.w
+        mask = imageio.imread(args.foreground_mask_path)[..., -1]  # rgba image, take the alpha channel
+        mask = cv2.resize(mask, dsize=(w, h)) == 255
+    # for DAVIS video sequences which come with segmentation masks
+    # or when a foreground mask for the query frame is provided
+    if not (trainer.with_mask or mask is not None):
+        frames = trainer.eval_video_correspondences(query_id, **eval_opts)
+        imageio.mimwrite(os.path.join(vis_dir, '{}_{:06d}_{}.mp4'.format(seq_name, trainer.step, query_id)),
+                         frames, quality=8, fps=10)
+    else:
+        # foreground
+        frames, kpts_foreground = trainer.eval_video_correspondences(query_id, use_mask=True,
+                                                                     mask=mask,
+                                                                     return_kpts=True,
+                                                                     **eval_opts)
+        imageio.mimwrite(os.path.join(vis_dir, '{}_{:06d}_foreground_{}.mp4'.format(seq_name, trainer.step, query_id)),
+                         frames, quality=8, fps=10)
+        kpts_foreground = kpts_foreground.cpu().numpy()
+        # background
+        frames, kpts_background = trainer.eval_video_correspondences(query_id, use_mask=True,
+                                                                     reverse_mask=True,
+                                                                     mask=mask,
+                                                                     return_kpts=True,
+                                                                     **eval_opts)
+        kpts_background = kpts_background.cpu().numpy()
+        imageio.mimwrite(os.path.join(vis_dir, '{}_{:06d}_background_{}.mp4'.format(seq_name, trainer.step, query_id)),
+                         frames, quality=8, fps=10)
+        # visualize trails
+        vis_trail(args.data_dir, kpts_foreground, kpts_background,
+                  os.path.join(vis_dir, '{}_{:06d}_{}_trails.mp4'.format(seq_name, trainer.step, query_id)))