visualize_heatmap_volume.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "speaking-algebra",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import cv2\n",
    "import os.path as osp\n",
    "import decord\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import urllib\n",
    "import moviepy.editor as mpy\n",
    "import random as rd\n",
    "from mmpose.apis import vis_pose_result\n",
    "from mmpose.models import TopDown\n",
    "from mmcv import load, dump\n",
    "\n",
    "# We assume the annotation is already prepared\n",
    "gym_train_ann_file = '../data/skeleton/gym_train.pkl'\n",
    "gym_val_ann_file = '../data/skeleton/gym_val.pkl'\n",
    "ntu60_xsub_train_ann_file = '../data/skeleton/ntu60_xsub_train.pkl'\n",
    "ntu60_xsub_val_ann_file = '../data/skeleton/ntu60_xsub_val.pkl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alive-consolidation",
   "metadata": {},
   "outputs": [],
   "source": [
    "FONTFACE = cv2.FONT_HERSHEY_DUPLEX\n",
    "FONTSCALE = 0.6\n",
    "FONTCOLOR = (255, 255, 255)\n",
    "BGBLUE = (0, 119, 182)\n",
    "THICKNESS = 1\n",
    "LINETYPE = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ranging-conjunction",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_label(frame, label, BGCOLOR=BGBLUE):\n",
    "    threshold = 30\n",
    "    def split_label(label):\n",
    "        label = label.split()\n",
    "        lines, cline = [], ''\n",
    "        for word in label:\n",
    "            if len(cline) + len(word) < threshold:\n",
    "                cline = cline + ' ' + word\n",
    "            else:\n",
    "                lines.append(cline)\n",
    "                cline = word\n",
    "        if cline != '':\n",
    "            lines += [cline]\n",
    "        return lines\n",
    "    \n",
    "    if len(label) > 30:\n",
    "        label = split_label(label)\n",
    "    else:\n",
    "        label = [label]\n",
    "    label = ['Action: '] + label\n",
    "    \n",
    "    sizes = []\n",
    "    for line in label:\n",
    "        sizes.append(cv2.getTextSize(line, FONTFACE, FONTSCALE, THICKNESS)[0])\n",
    "    box_width = max([x[0] for x in sizes]) + 10\n",
    "    text_height = sizes[0][1]\n",
    "    box_height = len(sizes) * (text_height + 6)\n",
    "    \n",
    "    cv2.rectangle(frame, (0, 0), (box_width, box_height), BGCOLOR, -1)\n",
    "    for i, line in enumerate(label):\n",
    "        location = (5, (text_height + 6) * i + text_height + 3)\n",
    "        cv2.putText(frame, line, location, FONTFACE, FONTSCALE, FONTCOLOR, THICKNESS, LINETYPE)\n",
    "    return frame\n",
    "    \n",
    "\n",
    "def vis_skeleton(vid_path, anno, category_name=None, ratio=0.5):\n",
    "    vid = decord.VideoReader(vid_path)\n",
    "    frames = [x.asnumpy() for x in vid]\n",
    "    \n",
    "    h, w, _ = frames[0].shape\n",
    "    new_shape = (int(w * ratio), int(h * ratio))\n",
    "    frames = [cv2.resize(f, new_shape) for f in frames]\n",
    "    \n",
    "    assert len(frames) == anno['total_frames']\n",
    "    # The shape is N x T x K x 3\n",
    "    kps = np.concatenate([anno['keypoint'], anno['keypoint_score'][..., None]], axis=-1)\n",
    "    kps[..., :2] *= ratio\n",
    "    # Convert to T x N x K x 3\n",
    "    kps = kps.transpose([1, 0, 2, 3])\n",
    "    vis_frames = []\n",
    "\n",
    "    # we need an instance of TopDown model, so build a minimal one\n",
    "    model = TopDown(backbone=dict(type='ShuffleNetV1'))\n",
    "\n",
    "    for f, kp in zip(frames, kps):\n",
    "        result = [dict(keypoints=k) for k in kp]\n",
    "        vis_frame = vis_pose_result(model, f, result)\n",
    "        \n",
    "        if category_name is not None:\n",
    "            vis_frame = add_label(vis_frame, category_name)\n",
    "        \n",
    "        vis_frames.append(vis_frame)\n",
    "    return vis_frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "applied-humanity",
   "metadata": {},
   "outputs": [],
   "source": [
    "keypoint_pipeline = [\n",
    "    dict(type='PoseDecode'),\n",
    "    dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),\n",
    "    dict(type='Resize', scale=(-1, 64)),\n",
    "    dict(type='CenterCrop', crop_size=64),\n",
    "    dict(type='GeneratePoseTarget', sigma=0.6, use_score=True, with_kp=True, with_limb=False)\n",
    "]\n",
    "\n",
    "limb_pipeline = [\n",
    "    dict(type='PoseDecode'),\n",
    "    dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),\n",
    "    dict(type='Resize', scale=(-1, 64)),\n",
    "    dict(type='CenterCrop', crop_size=64),\n",
    "    dict(type='GeneratePoseTarget', sigma=0.6, use_score=True, with_kp=False, with_limb=True)\n",
    "]\n",
    "\n",
    "from mmaction.datasets.pipelines import Compose\n",
    "def get_pseudo_heatmap(anno, flag='keypoint'):\n",
    "    assert flag in ['keypoint', 'limb']\n",
    "    pipeline = Compose(keypoint_pipeline if flag == 'keypoint' else limb_pipeline)\n",
    "    return pipeline(anno)['imgs']\n",
    "\n",
    "def vis_heatmaps(heatmaps, channel=-1, ratio=8):\n",
    "    # if channel is -1, draw all keypoints / limbs on the same map\n",
    "    import matplotlib.cm as cm\n",
    "    h, w, _ = heatmaps[0].shape\n",
    "    newh, neww = int(h * ratio), int(w * ratio)\n",
    "    \n",
    "    if channel == -1:\n",
    "        heatmaps = [np.max(x, axis=-1) for x in heatmaps]\n",
    "    cmap = cm.viridis\n",
    "    heatmaps = [(cmap(x)[..., :3] * 255).astype(np.uint8) for x in heatmaps]\n",
    "    heatmaps = [cv2.resize(x, (neww, newh)) for x in heatmaps]\n",
    "    return heatmaps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "automatic-commons",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load GYM annotations\n",
    "lines = list(urllib.request.urlopen('https://sdolivia.github.io/FineGym/resources/dataset/gym99_categories.txt'))\n",
    "gym_categories = [x.decode().strip().split('; ')[-1] for x in lines]\n",
    "gym_annos = load(gym_train_ann_file) + load(gym_val_ann_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "numerous-bristol",
   "metadata": {},
   "outputs": [],
   "source": [
    "# download sample videos of GYM\n",
    "!wget https://download.openmmlab.com/mmaction/posec3d/gym_samples.tar\n",
    "!tar -xf gym_samples.tar\n",
    "!rm gym_samples.tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ranging-harrison",
   "metadata": {},
   "outputs": [],
   "source": [
    "gym_root = 'gym_samples/'\n",
    "gym_vids = os.listdir(gym_root)\n",
    "# visualize pose of which video? index in 0 - 50.\n",
    "idx = 1\n",
    "vid = gym_vids[idx]\n",
    "\n",
    "frame_dir = vid.split('.')[0]\n",
    "vid_path = osp.join(gym_root, vid)\n",
    "anno = [x for x in gym_annos if x['frame_dir'] == frame_dir][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fitting-courage",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize Skeleton\n",
    "vis_frames = vis_skeleton(vid_path, anno, gym_categories[anno['label']])\n",
    "vid = mpy.ImageSequenceClip(vis_frames, fps=24)\n",
    "vid.ipython_display()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "orange-logging",
   "metadata": {},
   "outputs": [],
   "source": [
    "keypoint_heatmap = get_pseudo_heatmap(anno)\n",
    "keypoint_mapvis = vis_heatmaps(keypoint_heatmap)\n",
    "keypoint_mapvis = [add_label(f, gym_categories[anno['label']]) for f in keypoint_mapvis]\n",
    "vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24)\n",
    "vid.ipython_display()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "residential-conjunction",
   "metadata": {},
   "outputs": [],
   "source": [
    "limb_heatmap = get_pseudo_heatmap(anno, 'limb')\n",
    "limb_mapvis = vis_heatmaps(limb_heatmap)\n",
    "limb_mapvis = [add_label(f, gym_categories[anno['label']]) for f in limb_mapvis]\n",
    "vid = mpy.ImageSequenceClip(limb_mapvis, fps=24)\n",
    "vid.ipython_display()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "coupled-stranger",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The name list of \n",
    "ntu_categories = ['drink water', 'eat meal/snack', 'brushing teeth', 'brushing hair', 'drop', 'pickup', \n",
    "                  'throw', 'sitting down', 'standing up (from sitting position)', 'clapping', 'reading', \n",
    "                  'writing', 'tear up paper', 'wear jacket', 'take off jacket', 'wear a shoe', \n",
    "                  'take off a shoe', 'wear on glasses', 'take off glasses', 'put on a hat/cap', \n",
    "                  'take off a hat/cap', 'cheer up', 'hand waving', 'kicking something', \n",
    "                  'reach into pocket', 'hopping (one foot jumping)', 'jump up', \n",
    "                  'make a phone call/answer phone', 'playing with phone/tablet', 'typing on a keyboard', \n",
    "                  'pointing to something with finger', 'taking a selfie', 'check time (from watch)', \n",
    "                  'rub two hands together', 'nod head/bow', 'shake head', 'wipe face', 'salute', \n",
    "                  'put the palms together', 'cross hands in front (say stop)', 'sneeze/cough', \n",
    "                  'staggering', 'falling', 'touch head (headache)', 'touch chest (stomachache/heart pain)', \n",
    "                  'touch back (backache)', 'touch neck (neckache)', 'nausea or vomiting condition', \n",
    "                  'use a fan (with hand or paper)/feeling warm', 'punching/slapping other person', \n",
    "                  'kicking other person', 'pushing other person', 'pat on back of other person', \n",
    "                  'point finger at the other person', 'hugging other person', \n",
    "                  'giving something to other person', \"touch other person's pocket\", 'handshaking', \n",
    "                  'walking towards each other', 'walking apart from each other']\n",
    "ntu_annos = load(ntu60_xsub_train_ann_file) + load(ntu60_xsub_val_ann_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "seasonal-palmer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# download sample videos of NTU-60\n",
    "!wget https://download.openmmlab.com/mmaction/posec3d/ntu_samples.tar\n",
    "!tar -xf ntu_samples.tar\n",
    "!rm ntu_samples.tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "critical-review",
   "metadata": {},
   "outputs": [],
   "source": [
    "ntu_root = 'ntu_samples/'\n",
    "ntu_vids = os.listdir(ntu_root)\n",
    "# visualize pose of which video? index in 0 - 50.\n",
    "idx = 20\n",
    "vid = ntu_vids[idx]\n",
    "\n",
    "frame_dir = vid.split('.')[0]\n",
    "vid_path = osp.join(ntu_root, vid)\n",
    "anno = [x for x in ntu_annos if x['frame_dir'] == frame_dir.split('_')[0]][0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accompanied-invitation",
   "metadata": {},
   "outputs": [],
   "source": [
    "vis_frames = vis_skeleton(vid_path, anno, ntu_categories[anno['label']])\n",
    "vid = mpy.ImageSequenceClip(vis_frames, fps=24)\n",
    "vid.ipython_display()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "respiratory-conclusion",
   "metadata": {},
   "outputs": [],
   "source": [
    "keypoint_heatmap = get_pseudo_heatmap(anno)\n",
    "keypoint_mapvis = vis_heatmaps(keypoint_heatmap)\n",
    "keypoint_mapvis = [add_label(f, gym_categories[anno['label']]) for f in keypoint_mapvis]\n",
    "vid = mpy.ImageSequenceClip(keypoint_mapvis, fps=24)\n",
    "vid.ipython_display()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "thirty-vancouver",
   "metadata": {},
   "outputs": [],
   "source": [
    "limb_heatmap = get_pseudo_heatmap(anno, 'limb')\n",
    "limb_mapvis = vis_heatmaps(limb_heatmap)\n",
    "limb_mapvis = [add_label(f, gym_categories[anno['label']]) for f in limb_mapvis]\n",
    "vid = mpy.ImageSequenceClip(limb_mapvis, fps=24)\n",
    "vid.ipython_display()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}