ModelZoo / ResNet50_tensorflow / Commits / 54c6d319

Commit 54c6d319, authored Aug 11, 2020 by Kaushik Shivakumar
Parent: 416081c3

add file temporarily for meeting
Showing 1 changed file with 453 additions and 0 deletions.
research/object_detection/dataset_tools/create_ava_tf_record_for_context.py  0 → 100644
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Kaushik Shivakumar for the AVA Actions Dataset
# to work without MediaPipe, code started by Bryan Seybold.
r"""Code to download and parse the AVA dataset for TensorFlow models.

The [AVA data set](https://research.google.com/ava/index.html) is a data set
for human action recognition.

This script downloads the annotations and prepares data from similar
annotations if local video files are available. The video files can be
downloaded from the following website:
https://github.com/cvdfoundation/ava-dataset

Prior to running this script, please run download_and_preprocess_ava.sh to
download and trim the input videos.

Running this code as a module generates the data set on disk. First, the
required files are downloaded (_download_data), which enables constructing the
label map. Then (in generate_examples), for each split in the data set, the
metadata is generated from the annotations for each example
(_generate_metadata) and the video frames are decoded and filled in. This
script processes local video files defined in a custom CSV in a comparable
manner to the Kinetics data set for evaluating and predicting values on your
own data. The data set is written to disk as a set of numbered TFRecord files.

The custom CSV format must match the Kinetics data set format, with columns
corresponding to [[label_name], video, start, end, split] followed by lines
with those fields. (label_name is optional.) These field names can be used to
construct the paths to the video files using the Python string formatting
specification and the video_path_format_string flag:
  --video_path_format_string="/path/to/video/{video}.mp4"

Generating the data on disk can take considerable time and disk space. (Image
compression quality is the primary determiner of disk usage.)

Once the data is on disk, reading the data as a tf.data.Dataset is
accomplished with the following lines:

   kinetics = Kinetics("kinetics_data_path")
   dataset = kinetics.as_dataset("custom")
   # implement additional processing and batching here
   images_and_labels = dataset.make_one_shot_iterator().get_next()
   images = images_and_labels["images"]
   labels = images_and_labels["labels"]

If using the TFOD API, use the sequence example configuration in the
config.proto.

This data is structured for per-clip action classification where images is
the sequence of images and labels are a one-hot encoded value. See
as_dataset() for more details.

Note that the number of videos changes in the data set over time, so it will
likely be necessary to change the expected number of examples.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import contextlib
import csv
import glob
import hashlib
import os
import random
import subprocess
import sys
import tarfile
import tempfile
import zipfile

from absl import app
from absl import flags
from absl import logging
from six.moves import range
from six.moves import urllib
import tensorflow.compat.v1 as tf

import cv2

from object_detection.utils import dataset_util
GLOBAL_SOURCE_ID = 0
POSSIBLE_TIMESTAMPS = range(902, 1798)
ANNOTATION_URL = "https://research.google.com/ava/download/ava_v2.2.zip"
SECONDS_TO_MILLI = 1000
FILEPATTERN = "ava_actions_%s_1fps_rgb"
SPLITS = {
    "train": {
        "shards": 100,
        "examples": 862663,
        "csv": '',
        "excluded-csv": ''
    },
    "val": {
        "shards": 50,
        "examples": 243029,
        "csv": '',
        "excluded-csv": ''
    },
    # The test split has no ground truth, so TFRecords can't be created for it.
    "test": {
        "shards": 100,
        "examples": 0,
        "csv": '',
        "excluded-csv": ''
    }
}
NUM_CLASSES = 80
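# For reference, shard filenames are built from FILEPATTERN plus a shard
# index, so the first training shard would be written as
# "ava_actions_train_1fps_rgb-00000-of-00100" (an example name derived from
# the constants above, not taken from the original sources).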
def feature_list_feature(value):
  return tf.train.FeatureList(feature=value)
class Ava(object):
  """Generates and loads the AVA data set."""

  def __init__(self, path_to_output_dir, path_to_data_download):
    if not path_to_output_dir:
      raise ValueError("You must supply the path to the data directory.")
    self.path_to_data_download = path_to_data_download
    self.path_to_output_dir = path_to_output_dir
  def generate_examples(self, splits_to_process="train,val,test",
                        video_path_format_string=None,
                        download_labels_for_map=True,
                        seconds_per_sequence=10,
                        hop_between_sequences=10):
    """Downloads data and generates sharded TFRecords.

    Downloads the data files, generates metadata, and processes the metadata
    to produce tf.train.Examples for training. The resulting files can be read
    with as_dataset(). After running this function the original data files can
    be deleted.

    Args:
      splits_to_process: csv string of which splits to process. Allows
        providing a custom CSV with the CSV flag. The original data is still
        downloaded to generate the label_map.
      video_path_format_string: The format string for the path to local files.
      download_labels_for_map: If true, download the annotations to create the
        label map.
      seconds_per_sequence: The length of each sequence, in seconds.
      hop_between_sequences: The gap between the centers of
        successive sequences.
    """
    logging.info("Downloading data.")
    download_output = self._download_data(download_labels_for_map)
    for key in splits_to_process.split(","):
      logging.info("Generating metadata for split: %s", key)
      all_metadata = list(self._generate_metadata(
          download_output[0][key][0], download_output[0][key][1],
          download_output[1], seconds_per_sequence, hop_between_sequences,
          video_path_format_string))
      logging.info("An example of the metadata: ")
      logging.info(all_metadata[0])
      random.seed(47)
      random.shuffle(all_metadata)
      shards = SPLITS[key]["shards"]
      shard_names = [os.path.join(
          self.path_to_output_dir,
          FILEPATTERN % key + "-%05d-of-%05d" % (i, shards))
                     for i in range(shards)]
      writers = [tf.io.TFRecordWriter(shard_name)
                 for shard_name in shard_names]
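      # Examples are distributed round-robin across the shard writers:
      # example i goes to shard i % len(writers).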
      with _close_on_exit(writers) as writers:
        for i, seq_ex in enumerate(all_metadata):
          writers[i % len(writers)].write(seq_ex.SerializeToString())
    logging.info("Data extraction complete.")
  def _generate_metadata(self, annotation_file, excluded_file, label_map,
                         seconds_per_sequence, hop_between_sequences,
                         video_path_format_string):
    """For each row in the annotation CSV, generates the corresponding metadata.

    Args:
      annotation_file: path to the file of AVA CSV annotations.
      excluded_file: path to a CSV file of excluded timestamps for each video.
      label_map: an {int: string} label map.
      seconds_per_sequence: The number of seconds per example in each example.
      hop_between_sequences: The hop between sequences. If less than
        seconds_per_sequence, will overlap.
      video_path_format_string: The format string for the path to local files.

    Yields:
      Each tf.train.Example of metadata.
    """
    global GLOBAL_SOURCE_ID
    fieldnames = ["id", "timestamp_seconds", "xmin", "ymin", "xmax", "ymax",
                  "action_label"]
    frame_excluded = {}
    # create a sparse, nested map of videos and frame indices.
    with open(excluded_file, "r") as excluded:
      reader = csv.reader(excluded)
      for row in reader:
        frame_excluded[(row[0], int(float(row[1])))] = True
    with open(annotation_file, "r") as annotations:
      reader = csv.DictReader(annotations, fieldnames)
      frame_annotations = collections.defaultdict(list)
      ids = set()
      # aggregate by video and timestamp:
      for row in reader:
        ids.add(row["id"])
        key = (row["id"], int(float(row["timestamp_seconds"])))
        frame_annotations[key].append(row)
      # for each video, find aggregates near each sampled frame.
      logging.info("Generating metadata...")
      media_num = 1
      for media_id in ids:
        logging.info("%d/%d, ignore warnings.\n" % (media_num, len(ids)))
        media_num += 1
        filepath = glob.glob(
            video_path_format_string.format(media_id) + "*")[0]
        filename = filepath.split("/")[-1]
        cur_vid = cv2.VideoCapture(filepath)
        width = cur_vid.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = cur_vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
        middle_frame_time = POSSIBLE_TIMESTAMPS[0]
        total_non_excluded = 0
        while middle_frame_time < POSSIBLE_TIMESTAMPS[-1]:
          if (media_id, middle_frame_time) not in frame_excluded:
            total_non_excluded += 1
          middle_frame_time += 1
        middle_frame_time = POSSIBLE_TIMESTAMPS[0]
        cur_frame_num = 0
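        # Second pass over the clip: decode a frame at each sampled timestamp
        # and emit one tf.train.Example with that frame's annotations.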
        while middle_frame_time < POSSIBLE_TIMESTAMPS[-1]:
          GLOBAL_SOURCE_ID += 1
          cur_vid.set(cv2.CAP_PROP_POS_MSEC,
                      (middle_frame_time) * SECONDS_TO_MILLI)
          success, image = cur_vid.read()
          success, buffer = cv2.imencode('.jpg', image)
          bufstring = buffer.tostring()
          if (media_id, middle_frame_time) in frame_excluded:
            middle_frame_time += 1
            logging.info("Ignoring and skipping excluded frame.")
            continue
          cur_frame_num += 1
          source_id = str(GLOBAL_SOURCE_ID) + "_" + media_id
          GLOBAL_SOURCE_ID += 1
          xmins = []
          xmaxs = []
          ymins = []
          ymaxs = []
          areas = []
          labels = []
          label_strings = []
          confidences = []
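          # Collect the normalized box corners, class ids/names, and box areas
          # for every annotation at this (video, timestamp) pair.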
          for row in frame_annotations[(media_id, middle_frame_time)]:
            if len(row) > 2 and int(row["action_label"]) in label_map:
              xmins.append(float(row["xmin"]))
              xmaxs.append(float(row["xmax"]))
              ymins.append(float(row["ymin"]))
              ymaxs.append(float(row["ymax"]))
              areas.append(float((xmaxs[-1] - xmins[-1]) *
                                 (ymaxs[-1] - ymins[-1])) / 2)
              labels.append(int(row["action_label"]))
              label_strings.append(label_map[int(row["action_label"])])
              confidences.append(1)
            else:
              logging.warning("Unknown label: %s", row["action_label"])
          # Display the image and bounding boxes being
          # processed (for debugging purposes)
          """
          for i in range(len(xmins)):
            cv2.rectangle(image, (int(xmins[i] * width),
                                  int(ymaxs[i] * height)),
                          (int(xmaxs[i] * width),
                           int(ymins[i] * height)), (255, 0, 0), 2)
          cv2.imshow("mywindow", image)
          cv2.waitKey(1000)
          """
          middle_frame_time += 1 / 3
          if abs(middle_frame_time - round(middle_frame_time)) < 0.000001:
            middle_frame_time = round(middle_frame_time)
          num_frames_in_adjusted = (middle_frame_time - 900) * 3 * 2
          key = hashlib.sha256(bufstring).hexdigest()
          date_captured_feature = (
              "2020-06-17 00:%02d:%02d" % ((middle_frame_time - 900) // 60,
                                           (middle_frame_time - 900) % 60))
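          # The keys below follow the TF Object Detection API feature naming
          # for detection inputs (image/encoded, image/object/bbox/*,
          # image/object/class/*), plus image/seq_* context keys that record
          # which clip a frame belongs to and its position within that clip.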
          context_feature_dict = {
              'image/height':
                  dataset_util.int64_feature(int(height)),
              'image/width':
                  dataset_util.int64_feature(int(width)),
              'image/format':
                  dataset_util.bytes_feature('jpeg'.encode('utf8')),
              'image/source_id':
                  dataset_util.bytes_feature(source_id.encode("utf8")),
              'image/filename':
                  dataset_util.bytes_feature(source_id.encode("utf8")),
              'image/encoded':
                  dataset_util.bytes_feature(bufstring),
              'image/key/sha256':
                  dataset_util.bytes_feature(key.encode('utf8')),
              'image/object/bbox/xmin':
                  dataset_util.float_list_feature(xmins),
              'image/object/bbox/xmax':
                  dataset_util.float_list_feature(xmaxs),
              'image/object/bbox/ymin':
                  dataset_util.float_list_feature(ymins),
              'image/object/bbox/ymax':
                  dataset_util.float_list_feature(ymaxs),
              'image/object/area':
                  dataset_util.float_list_feature(areas),
              'image/object/class/label':
                  dataset_util.int64_list_feature(labels),
              'image/object/class/text':
                  dataset_util.bytes_list_feature(label_strings),
              'image/location':
                  dataset_util.bytes_feature(media_id.encode('utf8')),
              'image/date_captured':
                  dataset_util.bytes_feature(
                      date_captured_feature.encode('utf8')),
              'image/seq_num_frames':
                  dataset_util.int64_feature(total_non_excluded),
              'image/seq_frame_num':
                  dataset_util.int64_feature(cur_frame_num),
              'image/seq_id':
                  dataset_util.bytes_feature(media_id.encode('utf8')),
          }
          yield tf.train.Example(
              features=tf.train.Features(feature=context_feature_dict))
        cur_vid.release()
  def _download_data(self, download_labels_for_map):
    """Downloads and extracts data if not already available."""
    if sys.version_info >= (3, 0):
      urlretrieve = urllib.request.urlretrieve
    else:
      urlretrieve = urllib.request.urlretrieve
    logging.info("Creating data directory.")
    tf.io.gfile.makedirs(self.path_to_data_download)
    logging.info("Downloading annotations.")
    paths = {}
    if download_labels_for_map:
      zip_path = os.path.join(self.path_to_data_download,
                              ANNOTATION_URL.split("/")[-1])
      urlretrieve(ANNOTATION_URL, zip_path)
      with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(self.path_to_data_download)
    for split in ["train", "test", "val"]:
      csv_path = os.path.join(self.path_to_data_download,
                              "ava_%s_v2.2.csv" % split)
      excl_name = "ava_%s_excluded_timestamps_v2.2.csv" % split
      excluded_csv_path = os.path.join(self.path_to_data_download, excl_name)
      SPLITS[split]["csv"] = csv_path
      SPLITS[split]["excluded-csv"] = excluded_csv_path
      paths[split] = (csv_path, excluded_csv_path)
    label_map = self.get_label_map(os.path.join(
        self.path_to_data_download, "ava_action_list_v2.2.pbtxt"))
    return paths, label_map
  def get_label_map(self, path):
    """Parses a label map into {integer: string} format."""
    label_map = {}
    with open(path, "r") as f:
      current_id = -1
      current_label = ""
      for line in f:
        if "item {" in line:
          current_id = -1
          current_label = ""
        if "name:" in line:
          first_quote = line.find('"') + 1
          second_quote = line.find('"', first_quote)
          assert second_quote > -1
          current_label = line[first_quote:second_quote]
        if "id:" in line:
          current_id = int(line.split()[1])
        if "}" in line:
          label_map[current_id] = bytes23(current_label)
    logging.info(label_map)
    assert len(label_map) == NUM_CLASSES
    return label_map
def bytes23(string):
  """Creates a bytes string in either Python 2 or 3."""
  if sys.version_info >= (3, 0):
    return bytes(string, "utf8")
  else:
    return bytes(string)
@contextlib.contextmanager
def _close_on_exit(writers):
  """Call close on all writers on exit."""
  try:
    yield writers
  finally:
    for writer in writers:
      writer.close()
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  Ava(flags.FLAGS.path_to_output_dir,
      flags.FLAGS.path_to_download_data).generate_examples(
          flags.FLAGS.splits_to_process,
          flags.FLAGS.video_path_format_string,
          flags.FLAGS.download_labels_for_map,
          flags.FLAGS.seconds_per_sequence,
          flags.FLAGS.hop_between_sequences)
if __name__ == "__main__":
  flags.DEFINE_string("path_to_download_data", "",
                      "Path to directory to download data to.")
  flags.DEFINE_string("path_to_output_dir", "",
                      "Path to directory to write data to.")
  flags.DEFINE_boolean("download_labels_for_map", True,
                       "If true, download the annotations to construct the "
                       "label map.")
  flags.DEFINE_string("splits_to_process", "train,val",
                      "Process these splits. Useful for custom data splits.")
  flags.DEFINE_string("video_path_format_string", None,
                      "The format string for the path to local video files. "
                      "Uses the Python string.format() syntax with possible "
                      "arguments of {video}, {start}, {end}, {label_name}, "
                      "and {split}, corresponding to columns of the data "
                      "csvs.")
  flags.DEFINE_integer("seconds_per_sequence", 10,
                       "The number of seconds per example in each example.")
  flags.DEFINE_integer("hop_between_sequences", 10,
                       "The hop between sequences. If less than "
                       "seconds_per_sequence, will overlap.")
  app.run(main)
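
For reference, the shards written by this script can be read back with a short tf.data pipeline. The following is a minimal sketch, assuming the output directory is /tmp/ava_tfrecords (a placeholder path) and parsing only a subset of the features written in context_feature_dict above:

import tensorflow.compat.v1 as tf

# Feature spec for a subset of the keys written by _generate_metadata().
_FEATURE_SPEC = {
    'image/encoded': tf.io.FixedLenFeature([], tf.string),
    'image/source_id': tf.io.FixedLenFeature([], tf.string),
    'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
    'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
    'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
    'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
    'image/object/class/label': tf.io.VarLenFeature(tf.int64),
}

def _parse(serialized):
  # Decode one tf.train.Example and return the image plus its action labels.
  parsed = tf.io.parse_single_example(serialized, _FEATURE_SPEC)
  image = tf.io.decode_jpeg(parsed['image/encoded'], channels=3)
  labels = tf.sparse.to_dense(parsed['image/object/class/label'])
  return image, labels

# Shard names follow FILEPATTERN, e.g. ava_actions_train_1fps_rgb-00000-of-00100.
files = tf.io.gfile.glob('/tmp/ava_tfrecords/ava_actions_train_1fps_rgb-*')
dataset = tf.data.TFRecordDataset(files).map(_parse)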