"This notebook tutorial shows how to detect COTS using a pre-trained COTS detector implemented in TensorFlow. On top of just running the model on each frame of the video, the tracking code in this notebook aligns detections from frame to frame creating a consistent track for each COTS. Each track is given an id and frame count. Here is an example image from a video of a reef showing labeled COTS starfish.\n",
"This notebook tutorial shows how to detect COTS using a pre-trained COTS detector implemented in TensorFlow. On top of just running the model on each frame of the video, the tracking code in this notebook aligns detections from frame to frame creating a consistent track for each COTS. Each track is given an id and frame count. Here is an example image from a video of a reef showing labeled COTS starfish.\n",
"\n",
"\n",
...
@@ -86,6 +86,8 @@
...
@@ -86,6 +86,8 @@
"id": "a4R2T97u442o"
"id": "a4R2T97u442o"
},
},
"source": [
"source": [
"## Setup \n",
"\n",
"Install all needed packages."
"Install all needed packages."
]
]
},
},
...
@@ -99,7 +101,8 @@
...
@@ -99,7 +101,8 @@
"source": [
"source": [
"# remove the existing datascience package to avoid package conflicts in the colab environment\n",
"# remove the existing datascience package to avoid package conflicts in the colab environment\n",
"Re-encode the video, and reduce its size (Colab crashes if you try to embed the full size video)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_li0qe-gh1iT"
},
"outputs": [],
"source": [
"subprocess.check_call([\n",
" \"ffmpeg\", \"-y\", \"-i\", tmp_video_path,\n",
" \"-vf\",\"scale=800:-1\",\n",
" \"-crf\", \"18\",\n",
" \"-preset\", \"veryfast\",\n",
" \"-vcodec\", \"libx264\", preview_video_path])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2ItoiHyYQGya"
},
"source": [
"The images you downloaded are frames of a movie showing a top view of a coral reef with crown-of-thorns starfish. The movie looks like this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SiOsbr8xePkg"
},
"outputs": [],
"source": [
"embed_video_file(preview_video_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9Z0DTbWrZMZ-"
},
"source": [
"Can you se them? there are lots. The goal of the model is to put boxes around all of the starfish. Each starfish will get its own ID, and that ID will be stable as the camera passes over it."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d0iALUwM0g2p"
},
"source": [
"## Load the model"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fVq6vNBTxM62"
},
"source": [
"Download the trained COTS detection model that matches your preferences from earlier."
"That works well for one frame, but to count the number of COTS in a video you'll need to track the detections from frame to frame. The raw detection indices are not stable, they're just sorted by the detection score. Below both sets of detections are overlaid on the second image with the first frame's detections in white and the second frame's in orange, the indices are not aligned. The positions are shifted because of camera motion between teh two frames:"
"# Load the model and perform inference and tracking on sample data\n",
"Now keep the white boxes for the initial detections, and the the orange boxes for the new set of detections. But add the the optical-flow propagated tracks in green. You can see that by using optical-flow to propagate the old detections to the new frame the alignment is quite good. It's this alignment between the old and new detections (between the green and orange boxes) that allows the tracker to make a persistemt track for each COTS. "
"Load trained model from disk and create the inference function `model_fn()`. This might take a little while."
"# Define **OpticalFlowTracker** class and its related classes\n",
"# Define **OpticalFlowTracker** class and its related classes\n",
"\n",
"\n",
"These help track the movement of each COTS object throughout the image frames."
"These help track the movement of each COTS object across the video frames.\n",
"\n",
"The tracker collects related detections into `Track` objects. \n",
"\n",
"The class's init is defined below, it's methods are defined in the following cells.\n",
"\n",
"The `__init__` method just initializes the track counter (`track_id`), and sets some default values for the tracking and optical flow configurations. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3j2Ka1uGEoz4"
},
"outputs": [],
"source": [
"class OpticalFlowTracker:\n",
"    \"\"\"Optical flow tracker.\"\"\"\n",
"\n",
"    @classmethod\n",
"    def add_method(cls, fun):\n",
"        \"\"\"Attach a new method to the class.\"\"\"\n",
"        # Register `fun` so that methods defined in later cells become\n",
"        # part of the class.\n",
"        setattr(cls, fun.__name__, fun)\n",
"        return fun"
]
},
"Internally the tracker will use small `Track` and `Tracklet` classes to organize the data. The `Tracklet` class is just a `Detection` with a timestamp, while a `Track` is a track ID, the most recent detection and a list of `Tracklet` objects forming the history of the track."
]
]
},
{
...
@@ -302,16 +884,15 @@
},
"outputs": [],
"source": [
"@dataclasses.dataclass(frozen=True)\n",
"class Tracklet:\n",
"class Tracklet:\n",
" def __init__(self, timestamp, detection):\n",
" timestamp:float\n",
" self.timestamp = timestamp\n",
" detection:Detection\n",
" # Store a copy here to make sure the coordinates will not be updated\n",
" # when the optical flow propagation runs using another reference to this\n",
"The tracker keeps a list of active `Track` objects.\n",
"\n",
"The main `update` method takes an image, along with the list of detections and the timestamp for that image. On each frame step it performs the following sub-tasks:\n",
"\n",
"\n",
" def __repr__(self):\n",
"* The tracker uses optical flow to calculate where each `Track` expects to see a new `Detection`.\n",
" result = f'Track {self.id}'\n",
"* The tracker matches up the actual detections for the frame to the expected detections for each Track.\n",
" for linked_det in self.linked_dets:\n",
"* If a detection doesn't get matched to an existing track, a new track is created for the detection.\n",
" result += '\\n' + linked_det.__repr__()\n",
"* If a track stops getting assigned new detections, it is eventually deactivated. "
" for track, det in zip(self.tracks, detections)]\n"
" img = cv2.imread(filename)\n",
" video_writer.write(img)\n",
"cv2.destroyAllWindows()\n",
"video_writer.release()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uLbVeetwD0ph"
},
"source": [
"Re-encode the video, and reduce its size (Colab crashes if you try to embed the full size video)."
"The `apply_detections_to_tracks` method compares each detection to the updated bounding box for each track. The detection is added to the track that matches best, if the match is better than the `overlap_threshold`. If no track is better than the threshold, the detection is used to create a new track. \n",
"\n",
"If a track has no new detection assigned to it the predicted-detection is used."
"The goal of the model is to put boxes around all of the starfish. Each starfish gets its own ID, and that ID will be stable as the camera passes over it."