Commit d9d47174 authored by Kaushik Shivakumar

add new script, clean up both

parent 54c6d319
@@ -31,7 +31,7 @@ Running this code as a module generates the data set on disk. First, the
required files are downloaded (_download_data) which enables constructing the
label map. Then (in generate_examples), for each split in the data set, the
metadata and image frames are generated from the annotations for each sequence
example (_generate_metadata). The data set is written to disk as a set of
example (_generate_examples). The data set is written to disk as a set of
numbered TFRecord files.
Generating the data on disk can take considerable time and disk space.
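For orientation, the whole pipeline described above can be driven from Python in a few lines. A minimal sketch, assuming this module is importable as `ava` and using placeholder paths (module name and paths are assumptions, not values from this commit):

```python
# Minimal driver sketch: download annotations, window the local videos,
# and write sharded TFRecords. Module name and paths are hypothetical.
from ava import Ava

Ava("/tmp/ava_records", "/tmp/ava_downloads").generate_and_write_records(
    splits_to_process="train,val",
    video_path_format_string="/path/to/videos/{0}",
    seconds_per_sequence=10,
    hop_between_sequences=10)
```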
@@ -96,8 +96,8 @@ SPLITS = {
"csv": '',
"excluded-csv": ''
}
}
NUM_CLASSES = 80
def feature_list_feature(value):
@@ -188,7 +188,7 @@ class Ava(object):
reader = csv.DictReader(annotations, fieldnames)
frame_annotations = collections.defaultdict(list)
ids = set()
# aggregate by video and timestamp:
for row in reader:
ids.add(row["id"])
key = (row["id"], int(float(row["timestamp_seconds"])))
@@ -197,8 +197,6 @@ class Ava(object):
logging.info("Generating metadata...")
media_num = 1
for media_id in ids:
if media_num > 2:
continue
logging.info("%d/%d, ignore warnings.\n" % (media_num, len(ids)))
media_num += 1
@@ -261,7 +259,6 @@ class Ava(object):
windowed_timestamp += 1
if len(total_boxes) > 0:
print(total_boxes)
yield seq_example_util.make_sequence_example("AVA", media_id, total_images,
int(height), int(width), 'jpeg', total_source_ids, None, total_is_annotated,
total_boxes, total_label_strings, use_strs_for_source_id=True)
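The yielded records are tf.train.SequenceExamples. As a rough guide to consuming them, a decoding sketch follows; the feature keys are assumptions based on common TFOD sequence-example conventions and should be checked against seq_example_util:

```python
# Decoding sketch for one serialized SequenceExample.
# Key names are assumed, not taken from seq_example_util.
import tensorflow.compat.v1 as tf

CONTEXT_FEATURES = {
    "image/height": tf.io.FixedLenFeature([], tf.int64),
    "image/width": tf.io.FixedLenFeature([], tf.int64),
}
SEQUENCE_FEATURES = {
    "image/encoded": tf.io.FixedLenSequenceFeature([], tf.string),
    "region/label/string": tf.io.VarLenFeature(tf.string),
}

def decode(serialized):
  # Returns (context, feature_lists) dicts of tensors.
  return tf.io.parse_single_sequence_example(
      serialized, CONTEXT_FEATURES, SEQUENCE_FEATURES)
```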
......
# Copyright 2019 The MediaPipe Authors.
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,55 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Kaushik Shivakumar for the AVA Actions Dataset
# to work without MediaPipe, code started by Bryan Seybold.
r"""Code to download and parse the AVA Actions dataset for TensorFlow models.
r"""Code to download and parse the AVA dataset for TensorFlow models.
The [AVA data set](
The [AVA Actions data set](
https://research.google.com/ava/index.html)
is a data set for human action recognition.
is a dataset for human action recognition.
This script downloads the annotations and prepares data from similar annotations
if local video files are available. The video files can be downloaded
from the following website:
https://github.com/cvdfoundation/ava-datset
https://github.com/cvdfoundation/ava-dataset
Prior to running this script, please run download_and_preprocess_ava.sh to
download and trim input videos.
download input videos.
Running this code as a module generates the data set on disk. First, the
required files are downloaded (_download_data) which enables constructing the
label map. Then (in generate_examples), for each split in the data set, the
metadata is generated from the annotations for each example
(_generate_metadata), and MediaPipe is used to fill in the video frames
(_run_mediapipe). This script processes local video files defined in a custom
CSV in a comparable manner to the Kinetics data set for evaluating and
predicting values on your own data. The data set is written to disk as a set of
metadata and image frames are generated from the annotations for each sequence
example (_generate_examples). The data set is written to disk as a set of
numbered TFRecord files.
The custom CSV format must match the Kinetics data set format, with columns
corresponding to [[label_name], video, start, end, split] followed by lines with
those fields. (Label_name is optional.) These field names can be used to
construct the paths to the video files using the Python string formatting
specification and the video_path_format_string flag:
--video_path_format_string="/path/to/video/{video}.mp4"
Generating the data on disk can take considerable time and disk space.
(Image compression quality is the primary determiner of disk usage. TVL1 flow
determines runtime.)
Once the data is on disk, reading the data as a tf.data.Dataset is accomplished
with the following lines:
(Image compression quality is the primary determiner of disk usage.)
kinetics = Kinetics("kinetics_data_path")
dataset = kinetics.as_dataset("custom")
# implement additional processing and batching here
images_and_labels = dataset.make_one_shot_iterator().get_next()
images = images_and_labels["images"]
labels = images_and_labels["labels"]
IF using TFOD API, use the sequence example configuration in the config.proto.
If using the TensorFlow Object Detection API, set the input_type field
in the input_reader to TF_SEQUENCE_EXAMPLE.
This data is structured for per-clip action classification where images is
the sequence of images and labels are a one-hot encoded value. See
@@ -68,24 +45,20 @@ as_dataset() for more details.
Note that the number of videos changes in the data set over time, so it will
likely be necessary to change the expected number of examples.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
The video_path_format_string argument expects a value of the form:
"/path/to/videos/{0}"
"""
import contextlib
import csv
import os
import random
import subprocess
import sys
import tarfile
import zipfile
import tempfile
import collections
import glob
import hashlib
from absl import app
from absl import flags
@@ -94,23 +67,23 @@ from six.moves import range
from six.moves import urllib
import tensorflow.compat.v1 as tf
import cv2
import hashlib
from object_detection.utils import dataset_util
GLOBAL_SOURCE_ID = 0
POSSIBLE_TIMESTAMPS = range(902, 1798)
ANNOTATION_URL = "https://research.google.com/ava/download/ava_v2.2.zip"
SECONDS_TO_MILLI = 1000
FILEPATTERN = "ava_actions_%s_1fps_rgb"
SPLITS = {
"train": {
"shards": 100,
"shards": 1000,
"examples": 862663,
"csv": '',
"excluded-csv": ''
},
"val": {
"shards": 50,
"shards": 100,
"examples": 243029,
"csv": '',
"excluded-csv": ''
@@ -122,15 +95,15 @@ SPLITS = {
"csv": '',
"excluded-csv": ''
}
}
NUM_CLASSES = 80
def feature_list_feature(value):
return tf.train.FeatureList(feature=value)
class Ava(object):
"""Generates and loads the Kinetics data set."""
"""Generates and loads the AVA Actions 2.2 data set."""
def __init__(self, path_to_output_dir, path_to_data_download):
if not path_to_output_dir:
@@ -138,10 +111,9 @@ class Ava(object):
self.path_to_data_download = path_to_data_download
self.path_to_output_dir = path_to_output_dir
def generate_examples(self,
def generate_and_write_records(self,
splits_to_process="train,val,test",
video_path_format_string=None,
download_labels_for_map=True,
seconds_per_sequence=10,
hop_between_sequences=10):
"""Downloads data and generates sharded TFRecords.
@@ -156,17 +128,15 @@ class Ava(object):
a custom CSV with the CSV flag. The original data is still downloaded
to generate the label_map.
video_path_format_string: The format string for the path to local files.
download_labels_for_map: If true, download the annotations to create the
label map.
seconds_per_sequence: The length of each sequence, in seconds.
hop_between_sequences: The gap between the centers of
successive sequences.
"""
logging.info("Downloading data.")
download_output = self._download_data(download_labels_for_map)
download_output = self._download_data()
for key in splits_to_process.split(","):
logging.info("Generating metadata for split: %s", key)
all_metadata = list(self._generate_metadata(
logging.info("Generating examples for split: %s", key)
all_metadata = list(self._generate_examples(
download_output[0][key][0], download_output[0][key][1],
download_output[1], seconds_per_sequence, hop_between_sequences,
video_path_format_string))
@@ -184,10 +154,14 @@ class Ava(object):
writers[i % len(writers)].write(seq_ex.SerializeToString())
logging.info("Data extraction complete.")
def _generate_metadata(self, annotation_file, excluded_file, label_map,
def _generate_examples(self, annotation_file, excluded_file, label_map,
seconds_per_sequence, hop_between_sequences,
video_path_format_string):
"""For each row in the annotation CSV, generates the corresponding metadata.
"""For each row in the annotation CSV, generates the corresponding
examples. When iterating through frames for a single example, skips
over excluded frames. Generates equal-length sequence examples, each with
length seconds_per_sequence (1 fps) and gaps of hop_between_sequences
frames (and seconds) between them, possible greater due to excluded frames.
Args:
annotation_file: path to the file of AVA CSV annotations.
@@ -197,9 +171,8 @@ class Ava(object):
hop_between_sequences: The hop between sequences. If less than
seconds_per_sequence, sequences will overlap.
Yields:
Each tf.SequenceExample of metadata, ready to pass to MediaPipe.
Each prepared tf.SequenceExample containing both metadata and video frames.
"""
global GLOBAL_SOURCE_ID
fieldnames = ["id", "timestamp_seconds", "xmin", "ymin", "xmax", "ymax",
"action_label"]
frame_excluded = {}
@@ -217,7 +190,7 @@ class Ava(object):
ids.add(row["id"])
key = (row["id"], int(float(row["timestamp_seconds"])))
frame_annotations[key].append(row)
# for each video, find aggregates near each sampled frame:
logging.info("Generating metadata...")
media_num = 1
for media_id in ids:
@@ -240,8 +213,6 @@ class Ava(object):
middle_frame_time = POSSIBLE_TIMESTAMPS[0]
cur_frame_num = 0
while middle_frame_time < POSSIBLE_TIMESTAMPS[-1]:
GLOBAL_SOURCE_ID += 1
cur_vid.set(cv2.CAP_PROP_POS_MSEC,
(middle_frame_time) * SECONDS_TO_MILLI)
success, image = cur_vid.read()
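The seek-and-read above grabs one frame per annotated second. The `bufstring` hashed further down is presumably the JPEG-encoded frame; a sketch of that pairing (the helper name is hypothetical):

```python
# Frame-grab sketch: seek by milliseconds, read, JPEG-encode with OpenCV.
import cv2

SECONDS_TO_MILLI = 1000

def jpeg_frame_at(cur_vid, middle_frame_time):
  cur_vid.set(cv2.CAP_PROP_POS_MSEC, middle_frame_time * SECONDS_TO_MILLI)
  success, image = cur_vid.read()
  if not success:
    return None
  success, buf = cv2.imencode(".jpg", image)
  # Bytes suitable for hashing and for the sequence example.
  return buf.tobytes() if success else None
```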
@@ -255,8 +226,7 @@ class Ava(object):
continue
cur_frame_num += 1
source_id = str(GLOBAL_SOURCE_ID) + "_" + media_id
GLOBAL_SOURCE_ID += 1
source_id = str(middle_frame_time) + "_" + media_id
xmins = []
xmaxs = []
@@ -280,23 +250,7 @@ class Ava(object):
else:
logging.warning("Unknown label: %s", row["action_label"])
# Display the image and bounding boxes being
# processed (for debugging purposes)
"""
for i in range(len(xmins)):
cv2.rectangle(image, (int(xmins[i] * width),
int(ymaxs[i] * height)),
(int(xmaxs[i] * width),
int(ymins[i] * height)), (255, 0, 0), 2)
cv2.imshow("mywindow", image)
cv2.waitKey(1000)
"""
middle_frame_time += 1/3
if abs(middle_frame_time - round(middle_frame_time)) < 0.000001:
middle_frame_time = round(middle_frame_time)
num_frames_in_adjusted = (middle_frame_time - 900) * 3 * 2
middle_frame_time += 1
key = hashlib.sha256(bufstring).hexdigest()
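# Synthesize a capture-time string from the clip-relative offset (AVA segments start at 900 s).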
date_captured_feature = ("2020-06-17 00:%02d:%02d" % ((middle_frame_time - 900) // 60, (middle_frame_time - 900) % 60))
context_feature_dict = {
@@ -345,7 +299,7 @@ class Ava(object):
cur_vid.release()
def _download_data(self, download_labels_for_map):
def _download_data(self):
"""Downloads and extracts data if not already available."""
if sys.version_info >= (3, 0):
urlretrieve = urllib.request.urlretrieve
@@ -355,7 +309,6 @@ class Ava(object):
tf.io.gfile.makedirs(self.path_to_data_download)
logging.info("Downloading annotations.")
paths = {}
if download_labels_for_map:
zip_path = os.path.join(self.path_to_data_download,
ANNOTATION_URL.split("/")[-1])
urlretrieve(ANNOTATION_URL, zip_path)
@@ -400,7 +353,6 @@ def bytes23(string):
"""Creates a bytes string in either Python 2 or 3."""
if sys.version_info >= (3, 0):
return bytes(string, "utf8")
else:
return bytes(string)
@contextlib.contextmanager
@@ -416,10 +368,9 @@ def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
Ava(flags.FLAGS.path_to_output_dir,
flags.FLAGS.path_to_download_data).generate_examples(
flags.FLAGS.path_to_download_data).generate_and_write_records(
flags.FLAGS.splits_to_process,
flags.FLAGS.video_path_format_string,
flags.FLAGS.download_labels_for_map,
flags.FLAGS.seconds_per_sequence,
flags.FLAGS.hop_between_sequences)
@@ -430,10 +381,6 @@ if __name__ == "__main__":
flags.DEFINE_string("path_to_output_dir",
"",
"Path to directory to write data to.")
flags.DEFINE_boolean("download_labels_for_map",
True,
"If true, download the annotations to construct the "
"label map.")
flags.DEFINE_string("splits_to_process",
"train,val",
"Process these splits. Useful for custom data splits.")