Unverified Commit dfcc691c authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'master' into panoptic-deeplab

parents 83b87f05 a9d9e633
......@@ -8,7 +8,20 @@
"source": [
"# MoViNet Tutorial\n",
"\n",
"This notebook provides basic example code to create, build, and run [MoViNets (Mobile Video Networks)](https://arxiv.org/pdf/2103.11511.pdf). Models use TF Keras and support inference in TF 1 and TF 2. Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification."
"This notebook provides basic example code to build, run, and fine-tune [MoViNets (Mobile Video Networks)](https://arxiv.org/pdf/2103.11511.pdf).\n",
"\n",
"Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/) and the [TensorFlow Model Garden](https://github.com/tensorflow/models/tree/master/official/projects/movinet), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification. All Models use TensorFlow 2 with Keras for inference and training.\n",
"\n",
"The following steps will be performed:\n",
"\n",
"1. [Running base model inference with TensorFlow Hub](#scrollTo=6g0tuFvf71S9\u0026line=8\u0026uniqifier=1)\n",
"2. [Running streaming model inference with TensorFlow Hub and plotting predictions](#scrollTo=ADrHPmwGcBZ5\u0026line=4\u0026uniqifier=1)\n",
"3. [Exporting a streaming model to TensorFlow Lite for mobile](#scrollTo=W3CLHvubvdSI\u0026line=3\u0026uniqifier=1)\n",
"4. [Fine-Tuning a base Model with the TensorFlow Model Garden](#scrollTo=_s-7bEoa3f8g\u0026line=11\u0026uniqifier=1)\n",
"\n",
"![jumping jacks plot](https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/jumpingjacks_plot.gif)\n",
"\n",
"To generate video plots like the one above, see [section 2](#scrollTo=ADrHPmwGcBZ5\u0026line=4\u0026uniqifier=1)."
]
},
{
......@@ -19,17 +32,9 @@
"source": [
"## Setup\n",
"\n",
"It is recommended to run the models using GPUs or TPUs.\n",
"\n",
"To select a GPU/TPU in Colab, select `Runtime \u003e Change runtime type \u003e Hardware accelerator` dropdown in the top menu.\n",
"\n",
"### Install the TensorFlow Model Garden pip package\n",
"\n",
"- tf-models-official is the stable Model Garden package. Note that it may not include the latest changes in the tensorflow_models github repo.\n",
"- To include latest changes, you may install tf-models-nightly, which is the nightly Model Garden package created daily automatically.\n",
"pip will install all models and dependencies automatically.\n",
"For inference on smaller models (A0-A2), CPU is sufficient for this Colab. For fine-tuning, it is recommended to run the models using GPUs.\n",
"\n",
"Install the [mediapy](https://github.com/google/mediapy) package for visualizing images/videos."
"To select a GPU in Colab, select `Runtime \u003e Change runtime type \u003e Hardware accelerator \u003e GPU` dropdown in the top menu."
]
},
{
......@@ -40,10 +45,24 @@
},
"outputs": [],
"source": [
"!pip install -q tf-models-nightly tfds-nightly\n",
"# Install packages\n",
"\n",
"# tf-models-official is the stable Model Garden package\n",
"# tf-models-nightly includes latest changes\n",
"!pip install -q tf-models-nightly\n",
"\n",
"# Install tfds nightly to download ucf101\n",
"!pip install -q tfds-nightly\n",
"\n",
"# Install the mediapy package for visualizing images/videos.\n",
"# See https://github.com/google/mediapy\n",
"!command -v ffmpeg \u003e/dev/null || (apt update \u0026\u0026 apt install -y ffmpeg)\n",
"!pip install -q mediapy"
"!pip install -q mediapy\n",
"\n",
"# Due to a bug, we reinstall opencv\n",
"# See https://stackoverflow.com/q/70537488\n",
"!pip uninstall -q -y opencv-python-headless\n",
"!pip install -q \"opencv-python-headless\u003c4.3\""
]
},
{
......@@ -54,22 +73,268 @@
},
"outputs": [],
"source": [
"# Run imports\n",
"import os\n",
"from six.moves import urllib\n",
"\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import mediapy as media\n",
"import numpy as np\n",
"from PIL import Image\n",
"import PIL\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_datasets as tfds\n",
"import tensorflow_hub as hub\n",
"import tqdm\n",
"\n",
"from official.vision.beta.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model"
"mpl.rcParams.update({\n",
" 'font.size': 10,\n",
"})"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OnFqOXazoWgy"
},
"source": [
"Run the cell below to define helper functions and create variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "dx55NK3ZoZeh"
},
"outputs": [],
"source": [
"#@title Run this cell to set up some helper code.\n",
"\n",
"# Download Kinetics 600 label map\n",
"!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q\n",
"\n",
"with tf.io.gfile.GFile('labels.txt') as f:\n",
" lines = f.readlines()\n",
" KINETICS_600_LABELS_LIST = [line.strip() for line in lines]\n",
" KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)\n",
"\n",
"def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]\n",
" top_labels = tf.gather(label_map, top_predictions, axis=-1)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
" top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()\n",
" return tuple(zip(top_labels, top_probs))\n",
"\n",
"def predict_top_k(model, video, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" outputs = model.predict(video[tf.newaxis])[0]\n",
" probs = tf.nn.softmax(outputs)\n",
" return get_top_k(probs, k=k, label_map=label_map)\n",
"\n",
"def load_movinet_from_hub(model_id, model_mode, hub_version=3):\n",
" \"\"\"Loads a MoViNet model from TF Hub.\"\"\"\n",
" hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/{model_mode}/kinetics-600/classification/{hub_version}'\n",
"\n",
" encoder = hub.KerasLayer(hub_url, trainable=True)\n",
"\n",
" inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"\n",
" if model_mode == 'base':\n",
" inputs = dict(image=inputs)\n",
" else:\n",
" # Define the state inputs, which is a dict that maps state names to tensors.\n",
" init_states_fn = encoder.resolved_object.signatures['init_states']\n",
" state_shapes = {\n",
" name: ([s if s \u003e 0 else None for s in state.shape], state.dtype)\n",
" for name, state in init_states_fn(tf.constant([0, 0, 0, 0, 3])).items()\n",
" }\n",
" states_input = {\n",
" name: tf.keras.Input(shape[1:], dtype=dtype, name=name)\n",
" for name, (shape, dtype) in state_shapes.items()\n",
" }\n",
"\n",
" # The inputs to the model are the states and the video\n",
" inputs = {**states_input, 'image': inputs}\n",
"\n",
" # Output shape: [batch_size, 600]\n",
" outputs = encoder(inputs)\n",
"\n",
" model = tf.keras.Model(inputs, outputs)\n",
" model.build([1, 1, 1, 1, 3])\n",
"\n",
" return model\n",
"\n",
"# Download example gif\n",
"!wget https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif -O jumpingjack.gif -q\n",
"\n",
"def load_gif(file_path, image_size=(224, 224)):\n",
" \"\"\"Loads a gif file into a TF tensor.\"\"\"\n",
" with tf.io.gfile.GFile(file_path, 'rb') as f:\n",
" video = tf.io.decode_gif(f.read())\n",
" video = tf.image.resize(video, image_size)\n",
" video = tf.cast(video, tf.float32) / 255.\n",
" return video\n",
"\n",
"def get_top_k_streaming_labels(probs, k=5, label_map=KINETICS_600_LABELS_LIST):\n",
" \"\"\"Returns the top-k labels over an entire video sequence.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" k: the number of top predictions to select.\n",
" label_map: a list of labels to map logit indices to label strings.\n",
"\n",
" Returns:\n",
" a tuple of the top-k probabilities, labels, and logit indices\n",
" \"\"\"\n",
" top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[-1, :1]\n",
" categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]\n",
" categories = tf.reshape(categories, [-1])\n",
"\n",
" counts = sorted([\n",
" (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())\n",
" for i in tf.unique(categories)[0]\n",
" ], key=lambda x: x[1], reverse=True)\n",
"\n",
" top_probs_idx = tf.constant([i for i, _ in counts[:k]])\n",
" top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)\n",
" top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]\n",
"\n",
" top_probs = tf.gather(probs, top_probs_idx, axis=-1)\n",
" top_probs = tf.transpose(top_probs, perm=(1, 0))\n",
" top_labels = tf.gather(label_map, top_probs_idx, axis=0)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
"\n",
" return top_probs, top_labels, top_probs_idx\n",
"\n",
"def plot_streaming_top_preds_at_step(\n",
" top_probs,\n",
" top_labels,\n",
" step=None,\n",
" image=None,\n",
" legend_loc='lower left',\n",
" duration_seconds=10,\n",
" figure_height=500,\n",
" playhead_scale=0.8,\n",
" grid_alpha=0.3):\n",
" \"\"\"Generates a plot of the top video model predictions at a given time step.\n",
"\n",
" Args:\n",
" top_probs: a tensor of shape (k, num_frames) representing the top-k\n",
" probabilities over all frames.\n",
" top_labels: a list of length k that represents the top-k label strings.\n",
" step: the current time step in the range [0, num_frames].\n",
" image: the image frame to display at the current time step.\n",
" legend_loc: the placement location of the legend.\n",
" duration_seconds: the total duration of the video.\n",
" figure_height: the output figure height.\n",
" playhead_scale: scale value for the playhead.\n",
" grid_alpha: alpha value for the gridlines.\n",
"\n",
" Returns:\n",
" A tuple of the output numpy image, figure, and axes.\n",
" \"\"\"\n",
" num_labels, num_frames = top_probs.shape\n",
" if step is None:\n",
" step = num_frames\n",
"\n",
" fig = plt.figure(figsize=(6.5, 7), dpi=300)\n",
" gs = mpl.gridspec.GridSpec(8, 1)\n",
" ax2 = plt.subplot(gs[:-3, :])\n",
" ax = plt.subplot(gs[-3:, :])\n",
"\n",
" if image is not None:\n",
" ax2.imshow(image, interpolation='nearest')\n",
" ax2.axis('off')\n",
"\n",
" preview_line_x = tf.linspace(0., duration_seconds, num_frames)\n",
" preview_line_y = top_probs\n",
"\n",
" line_x = preview_line_x[:step+1]\n",
" line_y = preview_line_y[:, :step+1]\n",
"\n",
" for i in range(num_labels):\n",
" ax.plot(preview_line_x, preview_line_y[i], label=None, linewidth='1.5',\n",
" linestyle=':', color='gray')\n",
" ax.plot(line_x, line_y[i], label=top_labels[i], linewidth='2.0')\n",
"\n",
"\n",
" ax.grid(which='major', linestyle=':', linewidth='1.0', alpha=grid_alpha)\n",
" ax.grid(which='minor', linestyle=':', linewidth='0.5', alpha=grid_alpha)\n",
"\n",
" min_height = tf.reduce_min(top_probs) * playhead_scale\n",
" max_height = tf.reduce_max(top_probs)\n",
" ax.vlines(preview_line_x[step], min_height, max_height, colors='red')\n",
" ax.scatter(preview_line_x[step], max_height, color='red')\n",
"\n",
" ax.legend(loc=legend_loc)\n",
"\n",
" plt.xlim(0, duration_seconds)\n",
" plt.ylabel('Probability')\n",
" plt.xlabel('Time (s)')\n",
" plt.yscale('log')\n",
"\n",
" fig.tight_layout()\n",
" fig.canvas.draw()\n",
"\n",
" data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)\n",
" data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))\n",
" plt.close()\n",
"\n",
" figure_width = int(figure_height * data.shape[1] / data.shape[0])\n",
" image = PIL.Image.fromarray(data).resize([figure_width, figure_height])\n",
" image = np.array(image)\n",
"\n",
" return image, (fig, ax, ax2)\n",
"\n",
"def plot_streaming_top_preds(\n",
" probs,\n",
" video,\n",
" top_k=5,\n",
" video_fps=25.,\n",
" figure_height=500,\n",
" use_progbar=True):\n",
" \"\"\"Generates a video plot of the top video model predictions.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" video: the video to display in the plot.\n",
" top_k: the number of top predictions to select.\n",
" video_fps: the input video fps.\n",
" figure_fps: the output video fps.\n",
" figure_height: the height of the output video.\n",
" use_progbar: display a progress bar.\n",
"\n",
" Returns:\n",
" A numpy array representing the output video.\n",
" \"\"\"\n",
" video_fps = 8.\n",
" figure_height = 500\n",
" steps = video.shape[0]\n",
" duration = steps / video_fps\n",
"\n",
" top_probs, top_labels, _ = get_top_k_streaming_labels(probs, k=top_k)\n",
"\n",
" images = []\n",
" step_generator = tqdm.trange(steps) if use_progbar else range(steps)\n",
" for i in step_generator:\n",
" image, _ = plot_streaming_top_preds_at_step(\n",
" top_probs=top_probs,\n",
" top_labels=top_labels,\n",
" step=i,\n",
" image=video[i],\n",
" duration_seconds=duration,\n",
" figure_height=figure_height,\n",
" )\n",
" images.append(image)\n",
"\n",
" return np.array(images)"
]
},
{
......@@ -78,95 +343,335 @@
"id": "6g0tuFvf71S9"
},
"source": [
"## Example Usage with TensorFlow Hub\n",
"## Running Base Model Inference with TensorFlow Hub\n",
"\n",
"Load MoViNet-A2-Base from TensorFlow Hub, as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"We will load MoViNet-A2-Base from TensorFlow Hub as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load a MoViNet KerasLayer from [tfhub.dev](https://tfhub.dev).\n",
"- Wrap the layer in a [Keras Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model).\n",
"- Load an example image, and reshape it to a single frame video.\n",
"- Classify the video"
"- Load an example gif as a video.\n",
"- Classify the video and print the top-5 predicted classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nTUdhlRJzl2o"
"id": "KZKKNZVBpglJ"
},
"outputs": [],
"source": [
"movinet_a2_hub_url = 'https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/1'\n",
"\n",
"inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"model = load_movinet_from_hub('a2', 'base', hub_version=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7kU1_pL10l0B"
},
"source": [
"To provide a simple example video for classification, we can load a short gif of jumping jacks being performed.\n",
"\n",
"encoder = hub.KerasLayer(movinet_a2_hub_url, trainable=True)\n",
"![jumping jacks](https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif)\n",
"\n",
"# Important: To use tf.nn.conv3d on CPU, we must compile with tf.function.\n",
"encoder.call = tf.function(encoder.call, experimental_compile=True)\n",
"Attribution: Footage shared by [Coach Bobby Bluford](https://www.youtube.com/watch?v=-AxHpj-EuPg) on YouTube under the CC-BY license."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Iy0rKRrT723_"
},
"outputs": [],
"source": [
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"\n",
"# [batch_size, 600]\n",
"outputs = encoder(dict(image=inputs))\n",
"# Show video\n",
"print(video.shape)\n",
"media.show_video(video.numpy(), fps=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "P0bZfrAsqPv2",
"outputId": "bd82571f-8dfd-4faf-ed10-e34708b0405d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jumping jacks 0.9166437\n",
"zumba 0.016020728\n",
"doing aerobics 0.008053946\n",
"dancing charleston 0.006083599\n",
"lunge 0.0035062772\n"
]
}
],
"source": [
"# Run the model on the video and output the top 5 predictions\n",
"outputs = predict_top_k(model, video)\n",
"\n",
"model = tf.keras.Model(inputs, outputs)"
"for label, prob in outputs:\n",
" print(label, prob)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7kU1_pL10l0B"
"id": "ADrHPmwGcBZ5"
},
"source": [
"To provide a simple example video for classification, we can load a static image and reshape it to produce a video with a single frame."
"## Run Streaming Model Inference with TensorFlow Hub and Plot Predictions\n",
"\n",
"We will load MoViNet-A0-Stream from TensorFlow Hub as part of the [MoViNet collection](https://tfhub.dev/google/collections/movinet/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load a MoViNet model from [tfhub.dev](https://tfhub.dev).\n",
"- Classify an example video and plot the streaming predictions over time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Iy0rKRrT723_"
"id": "tXWR13wthnK5"
},
"outputs": [],
"source": [
"image_url = 'https://upload.wikimedia.org/wikipedia/commons/8/84/Ski_Famille_-_Family_Ski_Holidays.jpg'\n",
"image_height = 224\n",
"image_width = 224\n",
"model = load_movinet_from_hub('a2', 'stream', hub_version=3)\n",
"\n",
"# Create initial states for the stream model\n",
"init_states_fn = model.layers[-1].resolved_object.signatures['init_states']\n",
"init_states = init_states_fn(tf.shape(video[tf.newaxis]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YqSkt7l8ltwt",
"outputId": "6ccf1dd6-95d1-43b1-efdb-2e931dd3a19d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 13/13 [00:08\u003c00:00, 1.58it/s]\n",
"jumping jacks 0.9998123\n",
"zumba 0.00011835508\n",
"doing aerobics 3.3375818e-05\n",
"dancing charleston 4.9819987e-06\n",
"finger snapping 3.8673647e-06\n"
]
}
],
"source": [
"# Insert your video clip here\n",
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
"all_logits = []\n",
"\n",
"with urllib.request.urlopen(image_url) as f:\n",
" image = Image.open(f).resize((image_height, image_width))\n",
"video = tf.reshape(np.array(image), [1, 1, image_height, image_width, 3])\n",
"video = tf.cast(video, tf.float32) / 255.\n",
"# To run on a video, pass in one frame at a time\n",
"states = init_states\n",
"for clip in tqdm.tqdm(clips):\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" logits, states = model.predict({**states, 'image': clip}, verbose=0)\n",
" all_logits.append(logits)\n",
"\n",
"image"
"logits = tf.concat(all_logits, 0)\n",
"probs = tf.nn.softmax(logits)\n",
"\n",
"final_probs = probs[-1]\n",
"top_k = get_top_k(final_probs)\n",
"print()\n",
"for label, prob in top_k:\n",
" print(label, prob)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Xdox556CtMRb"
},
"outputs": [],
"source": [
"# Generate a plot and output to a video tensor\n",
"plot_video = plot_streaming_top_preds(probs, video, video_fps=8.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NSStKE9klCs3"
},
"outputs": [],
"source": [
"# For gif format, set codec='gif'\n",
"media.show_video(plot_video, fps=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Yf6EefHuWfxC"
"id": "W3CLHvubvdSI"
},
"source": [
"Run the model and output the predicted label. Expected output should be skiing (labels 464-467). E.g., 465 = \"skiing crosscountry\".\n",
"## Export a Streaming Model to TensorFlow Lite for Mobile\n",
"\n",
"See [here](https://gist.github.com/willprice/f19da185c9c5f32847134b87c1960769#file-kinetics_600_labels-csv) for a full list of all labels."
"We will convert a MoViNet-A0-Stream model to [TensorFlow Lite](https://www.tensorflow.org/lite).\n",
"\n",
"The following code will:\n",
"- Load a MoViNet-A0-Stream model.\n",
"- Convert the model to TF Lite.\n",
"- Run inference on an example video using the Python interpreter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OOpEKuqH8sH7"
"id": "KH0j-07KVh06"
},
"outputs": [],
"source": [
"output = model(video)\n",
"output_label_index = tf.argmax(output, -1)[0].numpy()\n",
"# Run imports\n",
"from official.vision.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model\n",
"from official.projects.movinet.tools import export_saved_model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RLkV0xtPvfkY"
},
"outputs": [],
"source": [
"# Export to saved model\n",
"saved_model_dir = 'model'\n",
"tflite_filename = 'model.tflite'\n",
"input_shape = [1, 1, 172, 172, 3]\n",
"batch_size, num_frames, image_size, = input_shape[:3]\n",
"\n",
"print(output_label_index)"
"tf.keras.backend.clear_session()\n",
"\n",
"# Create the model\n",
"input_specs = tf.keras.layers.InputSpec(shape=input_shape)\n",
"backbone = movinet.Movinet(\n",
" model_id='a0',\n",
" causal=True,\n",
" conv_type='2plus1d',\n",
" se_type='2plus3d',\n",
" input_specs=input_specs,\n",
" activation='hard_swish',\n",
" gating_activation='hard_sigmoid',\n",
" use_sync_bn=False,\n",
" use_external_states=True)\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" activation='hard_swish',\n",
" num_classes=600,\n",
" output_states=True,\n",
" input_specs=dict(image=input_specs))\n",
"model.build([1, 1, 1, 1, 3])\n",
"\n",
"# Extract pretrained weights\n",
"!wget https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_stream.tar.gz -O movinet_a0_stream.tar.gz -q\n",
"!tar -xvf movinet_a0_stream.tar.gz\n",
"\n",
"checkpoint_dir = 'movinet_a0_stream'\n",
"checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)\n",
"\n",
"# Convert to saved model\n",
"export_saved_model.export_saved_model(\n",
" model=model,\n",
" input_shape=input_shape,\n",
" export_path=saved_model_dir,\n",
" causal=True,\n",
" bundle_input_init_states_fn=False,\n",
" checkpoint_path=checkpoint_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gPg_6eMC8IwF"
},
"outputs": [],
"source": [
"# Convert to TF Lite\n",
"converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n",
"tflite_model = converter.convert()\n",
"\n",
"with open(tflite_filename, 'wb') as f:\n",
" f.write(tflite_model)\n",
"\n",
"# Create the interpreter and signature runner\n",
"interpreter = tf.lite.Interpreter(model_path=tflite_filename)\n",
"runner = interpreter.get_signature_runner()\n",
"\n",
"init_states = {\n",
" name: tf.zeros(x['shape'], dtype=x['dtype'])\n",
" for name, x in runner.get_input_details().items()\n",
"}\n",
"del init_states['image']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-TQ-7oSJIlTA",
"outputId": "a15519ff-d08c-40bc-fbea-d3a58169450c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jumping jacks 0.9791285\n",
"jogging 0.0019550633\n",
"riding unicycle 0.0017429002\n",
"passing soccer ball 0.0016952101\n",
"stretching arm 0.0014458151\n"
]
}
],
"source": [
"# Insert your video clip here\n",
"video = load_gif('jumpingjack.gif', image_size=(172, 172))\n",
"clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
"# To run on a video, pass in one frame at a time\n",
"states = init_states\n",
"for clip in clips:\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" outputs = runner(**states, image=clip)\n",
" logits = outputs.pop('logits')[0]\n",
" states = outputs\n",
"\n",
"probs = tf.nn.softmax(logits)\n",
"top_k = get_top_k(probs)\n",
"print()\n",
"for label, prob in top_k:\n",
" print(label, prob)"
]
},
{
......@@ -175,17 +680,17 @@
"id": "_s-7bEoa3f8g"
},
"source": [
"## Example Usage with the TensorFlow Model Garden\n",
"## Fine-Tune a Base Model with the TensorFlow Model Garden\n",
"\n",
"Fine-tune MoViNet-A0-Base on [UCF-101](https://www.crcv.ucf.edu/research/data-sets/ucf101/).\n",
"We will Fine-tune MoViNet-A0-Base on [UCF-101](https://www.crcv.ucf.edu/research/data-sets/ucf101/).\n",
"\n",
"The following code will:\n",
"\n",
"- Load the UCF-101 dataset with [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/ucf101).\n",
"- Create a [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) pipeline for training and evaluation.\n",
"- Create a simple [`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) pipeline for training and evaluation.\n",
"- Display some example videos from the dataset.\n",
"- Build a MoViNet model and load pretrained weights.\n",
"- Fine-tune the final classifier layers on UCF-101."
"- Fine-tune the final classifier layers on UCF-101 and evaluate accuracy on the validation set."
]
},
{
......@@ -196,7 +701,25 @@
"source": [
"### Load the UCF-101 Dataset with TensorFlow Datasets\n",
"\n",
"Calling `download_and_prepare()` will automatically download the dataset. After downloading, this cell will output information about the dataset."
"Calling `download_and_prepare()` will automatically download the dataset. This step may take up to 1 hour depending on the download and extraction speed. After downloading, the next cell will output information about the dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2IHLbPAfrs5P"
},
"outputs": [],
"source": [
"# Run imports\n",
"import tensorflow_datasets as tfds\n",
"\n",
"from official.vision.configs import video_classification\n",
"from official.projects.movinet.configs import movinet as movinet_configs\n",
"from official.projects.movinet.modeling import movinet\n",
"from official.projects.movinet.modeling import movinet_layers\n",
"from official.projects.movinet.modeling import movinet_model"
]
},
{
......@@ -288,7 +811,7 @@
")"
]
},
"execution_count": 0,
"execution_count": null,
"metadata": {
"tags": []
},
......@@ -310,15 +833,6 @@
"builder.info"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -327,6 +841,8 @@
},
"outputs": [],
"source": [
"# Build the training and evaluation datasets.\n",
"\n",
"batch_size = 8\n",
"num_frames = 8\n",
"frame_stride = 10\n",
......@@ -392,16 +908,9 @@
"id": "R3RHeuHdsd_3"
},
"source": [
"### Build MoViNet-A0-Base and Load Pretrained Weights"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JXVQOP9Rqk0I"
},
"source": [
"Here we create a MoViNet model using the open source code provided in [tensorflow/models](https://github.com/tensorflow/models) and load the pretrained weights. Here we freeze the all layers except the final classifier head to speed up fine-tuning."
"### Build MoViNet-A0-Base and Load Pretrained Weights\n",
"\n",
"Here we create a MoViNet model using the open source code provided in [official/projects/movinet](https://github.com/tensorflow/models/tree/master/official/projects/movinet) and load the pretrained weights. Here we freeze the all layers except the final classifier head to speed up fine-tuning."
]
},
{
......@@ -416,32 +925,38 @@
"\n",
"tf.keras.backend.clear_session()\n",
"\n",
"backbone = movinet.Movinet(\n",
" model_id=model_id)\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=600)\n",
"model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"backbone = movinet.Movinet(model_id=model_id)\n",
"model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600)\n",
"model.build([1, 1, 1, 1, 3])\n",
"\n",
"# Load pretrained weights from TF Hub\n",
"movinet_hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/base/kinetics-600/classification/1'\n",
"movinet_hub_model = hub.KerasLayer(movinet_hub_url, trainable=True)\n",
"pretrained_weights = {w.name: w for w in movinet_hub_model.weights}\n",
"model_weights = {w.name: w for w in model.weights}\n",
"for name in pretrained_weights:\n",
" model_weights[name].assign(pretrained_weights[name])\n",
"# Load pretrained weights\n",
"!wget https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_base.tar.gz -O movinet_a0_base.tar.gz -q\n",
"!tar -xvf movinet_a0_base.tar.gz\n",
"\n",
"# Wrap the backbone with a new classifier to create a new classifier head\n",
"# with num_classes outputs\n",
"model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=num_classes)\n",
"model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"checkpoint_dir = 'movinet_a0_base'\n",
"checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)\n",
"checkpoint = tf.train.Checkpoint(model=model)\n",
"status = checkpoint.restore(checkpoint_path)\n",
"status.assert_existing_objects_matched()\n",
"\n",
"def build_classifier(backbone, num_classes, freeze_backbone=False):\n",
" \"\"\"Builds a classifier on top of a backbone model.\"\"\"\n",
" model = movinet_model.MovinetClassifier(\n",
" backbone=backbone,\n",
" num_classes=num_classes)\n",
" model.build([batch_size, num_frames, resolution, resolution, 3])\n",
"\n",
"# Freeze all layers except for the final classifier head\n",
"for layer in model.layers[:-1]:\n",
" layer.trainable = False\n",
"model.layers[-1].trainable = True"
" if freeze_backbone:\n",
" for layer in model.layers[:-1]:\n",
" layer.trainable = False\n",
" model.layers[-1].trainable = True\n",
"\n",
" return model\n",
"\n",
"# Wrap the backbone with a new classifier to create a new classifier head\n",
"# with num_classes outputs (101 classes for UCF101).\n",
"# Freeze all layers except for the final classifier head.\n",
"model = build_classifier(backbone, num_classes, freeze_backbone=True)"
]
},
{
......@@ -500,7 +1015,7 @@
"id": "0IyAOOlcpHna"
},
"source": [
"Run the fine-tuning with Keras compile/fit. After fine-tuning the model, we should be able to achieve \u003e70% accuracy on the test set."
"Run the fine-tuning with Keras compile/fit. After fine-tuning the model, we should be able to achieve \u003e85% accuracy on the test set."
]
},
{
......@@ -527,11 +1042,11 @@
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"1192/1192 [==============================] - 348s 286ms/step - loss: 3.4914 - top_1: 0.3639 - top_5: 0.6294 - val_loss: 2.5153 - val_top_1: 0.5975 - val_top_5: 0.8565\n",
"1192/1192 [==============================] - 551s 451ms/step - loss: 2.5050 - top_1: 0.6692 - top_5: 0.8753 - val_loss: 1.6310 - val_top_1: 0.8109 - val_top_5: 0.9701\n",
"Epoch 2/3\n",
"1192/1192 [==============================] - 286s 240ms/step - loss: 2.1397 - top_1: 0.6794 - top_5: 0.9231 - val_loss: 2.0695 - val_top_1: 0.6838 - val_top_5: 0.9070\n",
"1192/1192 [==============================] - 533s 447ms/step - loss: 1.3336 - top_1: 0.9024 - top_5: 0.9906 - val_loss: 1.4576 - val_top_1: 0.8451 - val_top_5: 0.9740\n",
"Epoch 3/3\n",
"1192/1192 [==============================] - 348s 292ms/step - loss: 1.8925 - top_1: 0.7660 - top_5: 0.9454 - val_loss: 1.9848 - val_top_1: 0.7116 - val_top_5: 0.9227\n"
"1192/1192 [==============================] - 531s 446ms/step - loss: 1.2298 - top_1: 0.9329 - top_5: 0.9943 - val_loss: 1.4351 - val_top_1: 0.8514 - val_top_5: 0.9762\n"
]
}
],
......@@ -573,7 +1088,7 @@
"colab": {
"collapsed_sections": [],
"last_runtime": {
"build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
"build_target": "//learning/deepmind/dm_python:dm_notebook3",
"kind": "private"
},
"name": "movinet_tutorial.ipynb",
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -51,6 +51,8 @@ python3 export_saved_model.py \
To use an exported saved_model, refer to export_saved_model_test.py.
"""
from typing import Optional, Tuple
from absl import app
from absl import flags
import tensorflow as tf
......@@ -113,62 +115,50 @@ flags.DEFINE_string(
FLAGS = flags.FLAGS
def export_saved_model(
model: tf.keras.Model,
input_shape: Tuple[int, int, int, int, int],
export_path: str = '/tmp/movinet/',
causal: bool = False,
bundle_input_init_states_fn: bool = True,
checkpoint_path: Optional[str] = None) -> None:
"""Exports a MoViNet model to a saved model.
Args:
model: the tf.keras.Model to export.
input_shape: The 5D spatiotemporal input shape of size
[batch_size, num_frames, image_height, image_width, num_channels].
Set the field or a shape position in the field to None for dynamic input.
export_path: Export path to save the saved_model file.
causal: Run the model in causal mode.
bundle_input_init_states_fn: Add init_states as a function signature to the
saved model. This is not necessary if the input shape is static (e.g.,
for TF Lite).
checkpoint_path: Checkpoint path to load. Leave blank to keep the model's
initialization.
"""
# Use dimensions of 1 except the channels to export faster,
# since we only really need the last dimension to build and get the output
# states. These dimensions can be set to `None` once the model is built.
input_shape_concrete = [1 if s is None else s for s in input_shape]
model.build(input_shape_concrete)
# Compile model to generate some internal Keras variables.
model.compile()
if checkpoint_path:
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(checkpoint_path)
status.assert_existing_objects_matched()
if causal:
# Call the model once to get the output states. Call again with `states`
# input to ensure that the inputs with the `states` argument is built
# with the full output state shapes.
input_image = tf.ones(input_shape_concrete)
_, states = model({
**model.init_states(input_shape_concrete), 'image': input_image})
_ = model({**states, 'image': input_image})
# Create a function to explicitly set the names of the outputs
......@@ -179,10 +169,10 @@ def main(_) -> None:
specs = {
name: tf.TensorSpec(spec.shape, name=name, dtype=spec.dtype)
for name, spec in model.initial_state_specs(
input_shape).items()
}
specs['image'] = tf.TensorSpec(
input_shape, dtype=model.dtype, name='image')
predict_fn = tf.function(predict, jit_compile=True)
predict_fn = predict_fn.get_concrete_function(specs)
......@@ -191,17 +181,118 @@ def main(_) -> None:
init_states_fn = init_states_fn.get_concrete_function(
tf.TensorSpec([5], dtype=tf.int32))
if bundle_input_init_states_fn:
signatures = {'call': predict_fn, 'init_states': init_states_fn}
else:
signatures = predict_fn
tf.keras.models.save_model(
model, export_path, signatures=signatures)
else:
_ = model(tf.ones(input_shape_concrete))
tf.keras.models.save_model(model, export_path)
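# Example usage (a sketch mirroring the tutorial notebook; paths and the
# checkpoint directory are illustrative):
#
#   input_specs = tf.keras.layers.InputSpec(shape=[1, 1, 172, 172, 3])
#   backbone = movinet.Movinet(
#       model_id='a0', causal=True, conv_type='2plus1d', se_type='2plus3d',
#       input_specs=input_specs, use_external_states=True)
#   model = movinet_model.MovinetClassifier(
#       backbone=backbone, num_classes=600, output_states=True,
#       input_specs=dict(image=input_specs))
#   model.build([1, 1, 1, 1, 3])
#   export_saved_model(
#       model=model,
#       input_shape=[1, 1, 172, 172, 3],
#       export_path='/tmp/movinet_a0_stream',
#       causal=True,
#       bundle_input_init_states_fn=False,
#       checkpoint_path=tf.train.latest_checkpoint('movinet_a0_stream'))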
def build_and_export_saved_model(
export_path: str = '/tmp/movinet/',
model_id: str = 'a0',
causal: bool = False,
conv_type: str = '3d',
se_type: str = '3d',
activation: str = 'swish',
classifier_activation: str = 'swish',
gating_activation: str = 'sigmoid',
use_positional_encoding: bool = False,
num_classes: int = 600,
input_shape: Optional[Tuple[int, int, int, int, int]] = None,
bundle_input_init_states_fn: bool = True,
checkpoint_path: Optional[str] = None) -> None:
"""Builds and exports a MoViNet model to a saved model.
Args:
export_path: Export path to save the saved_model file.
model_id: MoViNet model name.
causal: Run the model in causal mode.
conv_type: 3d, 2plus1d, or 3d_2plus1d. 3d configures the network
to use the default 3D convolution. 2plus1d uses (2+1)D convolution
with Conv2D operations and 2D reshaping (e.g., a 5x3x3 kernel becomes
3x3 followed by 5x1 conv). 3d_2plus1d uses (2+1)D convolution with
Conv3D and no 2D reshaping (e.g., a 5x3x3 kernel becomes 1x3x3
followed by 5x1x1 conv).
se_type:
3d, 2d, or 2plus3d. 3d uses the default 3D spatiotemporal global average
pooling for squeeze excitation. 2d uses 2D spatial global average pooling
on each frame. 2plus3d concatenates both 3D and 2D global average
pooling.
activation: The main activation to use across layers.
classifier_activation: The classifier activation to use.
gating_activation: The gating activation to use in squeeze-excitation
layers.
use_positional_encoding: Whether to use positional encoding (only applied
when causal=True).
num_classes: The number of classes for prediction.
input_shape: The 5D spatiotemporal input shape of size
[batch_size, num_frames, image_height, image_width, num_channels].
Set the field or a shape position in the field to None for dynamic input.
bundle_input_init_states_fn: Add init_states as a function signature to the
saved model. This is not necessary if the input shape is static (e.g.,
for TF Lite).
checkpoint_path: Checkpoint path to load. Leave blank for default
initialization.
"""
input_specs = tf.keras.layers.InputSpec(shape=input_shape)
# Override swish activation implementation to remove custom gradients
if activation == 'swish':
activation = 'simple_swish'
if classifier_activation == 'swish':
classifier_activation = 'simple_swish'
backbone = movinet.Movinet(
model_id=model_id,
causal=causal,
use_positional_encoding=use_positional_encoding,
conv_type=conv_type,
se_type=se_type,
input_specs=input_specs,
activation=activation,
gating_activation=gating_activation,
use_sync_bn=False,
use_external_states=causal)
model = movinet_model.MovinetClassifier(
backbone,
num_classes=num_classes,
output_states=causal,
input_specs=dict(image=input_specs),
activation=classifier_activation)
export_saved_model(
model=model,
input_shape=input_shape,
export_path=export_path,
causal=causal,
bundle_input_init_states_fn=bundle_input_init_states_fn,
checkpoint_path=checkpoint_path)
def main(_) -> None:
input_shape = (
FLAGS.batch_size, FLAGS.num_frames, FLAGS.image_size, FLAGS.image_size, 3)
build_and_export_saved_model(
export_path=FLAGS.export_path,
model_id=FLAGS.model_id,
causal=FLAGS.causal,
conv_type=FLAGS.conv_type,
se_type=FLAGS.se_type,
activation=FLAGS.activation,
classifier_activation=FLAGS.classifier_activation,
gating_activation=FLAGS.gating_activation,
use_positional_encoding=FLAGS.use_positional_encoding,
num_classes=FLAGS.num_classes,
input_shape=input_shape,
bundle_input_init_states_fn=FLAGS.bundle_input_init_states_fn,
checkpoint_path=FLAGS.checkpoint_path)
print(' ----- Done. Saved Model is saved at {}'.format(FLAGS.export_path))
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "qwBHHt-XvPqn"
},
"source": [
"# Plot MoViNet Video Stream Predictions\n",
"\n",
"This notebook uses [MoViNets (Mobile Video Networks)](https://github.com/tensorflow/models/tree/master/official/projects/movinet) to predict a human action in a streaming video and outputs a visualization of predictions on each frame.\n",
"\n",
"Provide a video URL or upload your own to see how predictions change over time. All models can be run on CPU.\n",
"\n",
"Pretrained models are provided by [TensorFlow Hub](https://tfhub.dev/google/collections/movinet/) and the [TensorFlow Model Garden](https://github.com/tensorflow/models/tree/master/official/projects/movinet), trained on [Kinetics 600](https://deepmind.com/research/open-source/kinetics) for video action classification. All Models use TensorFlow 2 with Keras for inference and training. See the [research paper](https://arxiv.org/pdf/2103.11511.pdf) for more details.\n",
"\n",
"Example output using [this gif](https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif) as input:\n",
"\n",
"![jumping jacks plot](https://storage.googleapis.com/tf_model_garden/vision/movinet/artifacts/jumpingjacks_plot.gif)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "ElvELd9mIfZe"
},
"outputs": [],
"source": [
"#@title Run this cell to initialize and setup a [MoViNet](https://github.com/tensorflow/models/tree/master/official/projects/movinet) model.\n",
"\n",
"\n",
"# Install the mediapy package for visualizing images/videos.\n",
"# See https://github.com/google/mediapy\n",
"!pip install -q mediapy\n",
"\n",
"# Run imports\n",
"import os\n",
"import io\n",
"\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import mediapy as media\n",
"import numpy as np\n",
"import PIL\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_datasets as tfds\n",
"import tensorflow_hub as hub\n",
"import tqdm\n",
"from google.colab import files\n",
"import urllib.request\n",
"\n",
"mpl.rcParams.update({\n",
" 'font.size': 10,\n",
"})\n",
"\n",
"\n",
"# Download Kinetics 600 label map\n",
"!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q\n",
"\n",
"with tf.io.gfile.GFile('labels.txt') as f:\n",
" lines = f.readlines()\n",
" KINETICS_600_LABELS_LIST = [line.strip() for line in lines]\n",
" KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)\n",
"\n",
"def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]\n",
" top_labels = tf.gather(label_map, top_predictions, axis=-1)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
" top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()\n",
" return tuple(zip(top_labels, top_probs))\n",
"\n",
"def predict_top_k(model, video, k=5, label_map=KINETICS_600_LABELS):\n",
" \"\"\"Outputs the top k model labels and probabilities on the given video.\"\"\"\n",
" outputs = model.predict(video[tf.newaxis])[0]\n",
" probs = tf.nn.softmax(outputs)\n",
" return get_top_k(probs, k=k, label_map=label_map)\n",
"\n",
"def load_movinet_from_hub(model_id, model_mode, hub_version=3):\n",
" \"\"\"Loads a MoViNet model from TF Hub.\"\"\"\n",
" hub_url = f'https://tfhub.dev/tensorflow/movinet/{model_id}/{model_mode}/kinetics-600/classification/{hub_version}'\n",
"\n",
" encoder = hub.KerasLayer(hub_url, trainable=True)\n",
"\n",
" inputs = tf.keras.layers.Input(\n",
" shape=[None, None, None, 3],\n",
" dtype=tf.float32)\n",
"\n",
" if model_mode == 'base':\n",
" inputs = dict(image=inputs)\n",
" else:\n",
" # Define the state inputs, which is a dict that maps state names to tensors.\n",
" init_states_fn = encoder.resolved_object.signatures['init_states']\n",
" state_shapes = {\n",
" name: ([s if s \u003e 0 else None for s in state.shape], state.dtype)\n",
" for name, state in init_states_fn(tf.constant([0, 0, 0, 0, 3])).items()\n",
" }\n",
" states_input = {\n",
" name: tf.keras.Input(shape[1:], dtype=dtype, name=name)\n",
" for name, (shape, dtype) in state_shapes.items()\n",
" }\n",
"\n",
" # The inputs to the model are the states and the video\n",
" inputs = {**states_input, 'image': inputs}\n",
"\n",
" # Output shape: [batch_size, 600]\n",
" outputs = encoder(inputs)\n",
"\n",
" model = tf.keras.Model(inputs, outputs)\n",
" model.build([1, 1, 1, 1, 3])\n",
"\n",
" return model\n",
"\n",
"# Download example gif\n",
"!wget https://github.com/tensorflow/models/raw/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/jumpingjack.gif -O jumpingjack.gif -q\n",
"\n",
"def load_gif(file_path, image_size=(224, 224)):\n",
" \"\"\"Loads a gif file into a TF tensor.\"\"\"\n",
" with tf.io.gfile.GFile(file_path, 'rb') as f:\n",
" video = tf.io.decode_gif(f.read())\n",
" video = tf.image.resize(video, image_size)\n",
" video = tf.cast(video, tf.float32) / 255.\n",
" return video\n",
"\n",
"def get_top_k_streaming_labels(probs, k=5, label_map=KINETICS_600_LABELS_LIST):\n",
" \"\"\"Returns the top-k labels over an entire video sequence.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" k: the number of top predictions to select.\n",
" label_map: a list of labels to map logit indices to label strings.\n",
"\n",
" Returns:\n",
" a tuple of the top-k probabilities, labels, and logit indices\n",
" \"\"\"\n",
" top_categories_last = tf.argsort(probs, -1, 'DESCENDING')[-1, :1]\n",
" categories = tf.argsort(probs, -1, 'DESCENDING')[:, :k]\n",
" categories = tf.reshape(categories, [-1])\n",
"\n",
" counts = sorted([\n",
" (i.numpy(), tf.reduce_sum(tf.cast(categories == i, tf.int32)).numpy())\n",
" for i in tf.unique(categories)[0]\n",
" ], key=lambda x: x[1], reverse=True)\n",
"\n",
" top_probs_idx = tf.constant([i for i, _ in counts[:k]])\n",
" top_probs_idx = tf.concat([top_categories_last, top_probs_idx], 0)\n",
" top_probs_idx = tf.unique(top_probs_idx)[0][:k+1]\n",
"\n",
" top_probs = tf.gather(probs, top_probs_idx, axis=-1)\n",
" top_probs = tf.transpose(top_probs, perm=(1, 0))\n",
" top_labels = tf.gather(label_map, top_probs_idx, axis=0)\n",
" top_labels = [label.decode('utf8') for label in top_labels.numpy()]\n",
"\n",
" return top_probs, top_labels, top_probs_idx\n",
"\n",
"def plot_streaming_top_preds_at_step(\n",
" top_probs,\n",
" top_labels,\n",
" step=None,\n",
" image=None,\n",
" legend_loc='lower left',\n",
" duration_seconds=10,\n",
" figure_height=500,\n",
" playhead_scale=0.8,\n",
" grid_alpha=0.3):\n",
" \"\"\"Generates a plot of the top video model predictions at a given time step.\n",
"\n",
" Args:\n",
" top_probs: a tensor of shape (k, num_frames) representing the top-k\n",
" probabilities over all frames.\n",
" top_labels: a list of length k that represents the top-k label strings.\n",
" step: the current time step in the range [0, num_frames].\n",
" image: the image frame to display at the current time step.\n",
" legend_loc: the placement location of the legend.\n",
" duration_seconds: the total duration of the video.\n",
" figure_height: the output figure height.\n",
" playhead_scale: scale value for the playhead.\n",
" grid_alpha: alpha value for the gridlines.\n",
"\n",
" Returns:\n",
" A tuple of the output numpy image, figure, and axes.\n",
" \"\"\"\n",
" num_labels, num_frames = top_probs.shape\n",
" if step is None:\n",
" step = num_frames\n",
"\n",
" fig = plt.figure(figsize=(6.5, 7), dpi=300)\n",
" gs = mpl.gridspec.GridSpec(8, 1)\n",
" ax2 = plt.subplot(gs[:-3, :])\n",
" ax = plt.subplot(gs[-3:, :])\n",
"\n",
" if image is not None:\n",
" ax2.imshow(image, interpolation='nearest')\n",
" ax2.axis('off')\n",
"\n",
" preview_line_x = tf.linspace(0., duration_seconds, num_frames)\n",
" preview_line_y = top_probs\n",
"\n",
" line_x = preview_line_x[:step+1]\n",
" line_y = preview_line_y[:, :step+1]\n",
"\n",
" for i in range(num_labels):\n",
" ax.plot(preview_line_x, preview_line_y[i], label=None, linewidth='1.5',\n",
" linestyle=':', color='gray')\n",
" ax.plot(line_x, line_y[i], label=top_labels[i], linewidth='2.0')\n",
"\n",
"\n",
" ax.grid(which='major', linestyle=':', linewidth='1.0', alpha=grid_alpha)\n",
" ax.grid(which='minor', linestyle=':', linewidth='0.5', alpha=grid_alpha)\n",
"\n",
" min_height = tf.reduce_min(top_probs) * playhead_scale\n",
" max_height = tf.reduce_max(top_probs)\n",
" ax.vlines(preview_line_x[step], min_height, max_height, colors='red')\n",
" ax.scatter(preview_line_x[step], max_height, color='red')\n",
"\n",
" ax.legend(loc=legend_loc)\n",
"\n",
" plt.xlim(0, duration_seconds)\n",
" plt.ylabel('Probability')\n",
" plt.xlabel('Time (s)')\n",
" plt.yscale('log')\n",
"\n",
" fig.tight_layout()\n",
" fig.canvas.draw()\n",
"\n",
" data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)\n",
" data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))\n",
" plt.close()\n",
"\n",
" figure_width = int(figure_height * data.shape[1] / data.shape[0])\n",
" image = PIL.Image.fromarray(data).resize([figure_width, figure_height])\n",
" image = np.array(image)\n",
"\n",
" return image, (fig, ax, ax2)\n",
"\n",
"def plot_streaming_top_preds(\n",
" probs,\n",
" video,\n",
" top_k=5,\n",
" video_fps=25.,\n",
" figure_height=500,\n",
" use_progbar=True):\n",
" \"\"\"Generates a video plot of the top video model predictions.\n",
"\n",
" Args:\n",
" probs: probability tensor of shape (num_frames, num_classes) that represents\n",
" the probability of each class on each frame.\n",
" video: the video to display in the plot.\n",
" top_k: the number of top predictions to select.\n",
" video_fps: the input video fps.\n",
" figure_fps: the output video fps.\n",
" figure_height: the height of the output video.\n",
" use_progbar: display a progress bar.\n",
"\n",
" Returns:\n",
" A numpy array representing the output video.\n",
" \"\"\"\n",
" video_fps = 8.\n",
" figure_height = 500\n",
" steps = video.shape[0]\n",
" duration = steps / video_fps\n",
"\n",
" top_probs, top_labels, _ = get_top_k_streaming_labels(probs, k=top_k)\n",
"\n",
" images = []\n",
" step_generator = tqdm.trange(steps) if use_progbar else range(steps)\n",
" for i in step_generator:\n",
" image, _ = plot_streaming_top_preds_at_step(\n",
" top_probs=top_probs,\n",
" top_labels=top_labels,\n",
" step=i,\n",
" image=video[i],\n",
" duration_seconds=duration,\n",
" figure_height=figure_height,\n",
" )\n",
" images.append(image)\n",
"\n",
" return np.array(images)\n",
"\n",
"def generate_plot(\n",
" model,\n",
" video_url=None,\n",
" resolution=224,\n",
" video_fps=25,\n",
" display_fps=25):\n",
" # Load the video\n",
" if not video_url:\n",
" video_bytes = list(files.upload().values())[0]\n",
" with open('video', 'wb') as f:\n",
" f.write(video_bytes)\n",
" else:\n",
" urllib.request.urlretrieve(video_url, \"video\")\n",
"\n",
" video = tf.cast(media.read_video('video'), tf.float32) / 255.\n",
" video = tf.image.resize(video, [resolution, resolution], preserve_aspect_ratio=True)\n",
"\n",
" # Create initial states for the stream model\n",
" init_states_fn = model.layers[-1].resolved_object.signatures['init_states']\n",
" init_states = init_states_fn(tf.shape(video[tf.newaxis]))\n",
"\n",
" clips = tf.split(video[tf.newaxis], video.shape[0], axis=1)\n",
"\n",
" all_logits = []\n",
"\n",
" print('Running the model on the video...')\n",
"\n",
" # To run on a video, pass in one frame at a time\n",
" states = init_states\n",
" for clip in tqdm.tqdm(clips):\n",
" # Input shape: [1, 1, 172, 172, 3]\n",
" logits, states = model.predict({**states, 'image': clip}, verbose=0)\n",
" all_logits.append(logits)\n",
"\n",
" logits = tf.concat(all_logits, 0)\n",
" probs = tf.nn.softmax(logits)\n",
"\n",
" print('Generating the plot...')\n",
"\n",
" # Generate a plot and output to a video tensor\n",
" plot_video = plot_streaming_top_preds(probs, video, video_fps=video_fps)\n",
" media.show_video(plot_video, fps=display_fps, codec='gif')\n",
"\n",
"model_size = 'm' #@param [\"xs\", \"s\", \"m\", \"l\", \"xl\", \"xxl\"]\n",
"\n",
"model_map = {\n",
" 'xs': 'a0',\n",
" 's': 'a1',\n",
" 'm': 'a2',\n",
" 'l': 'a3',\n",
" 'xl': 'a4',\n",
" 'xxl': 'a5',\n",
"}\n",
"movinet_model_id = model_map[model_size]\n",
"\n",
"model = load_movinet_from_hub(\n",
" movinet_model_id, 'stream', hub_version=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "jO6HrPk8pqo8"
},
"outputs": [],
"source": [
"#@title Generate a video plot.\n",
"\n",
"#@markdown You may add a video URL (gif or mp4) or leave the video_url field blank to upload your own file.\n",
"video_url = \"https://i.pinimg.com/originals/33/5e/31/335e31bc8ed52511da0cfb4bc44e95c7.gif\" #@param {type:\"string\"}\n",
"\n",
"#@markdown The base input resolution to the model. A good value is 224, but can change based on model size.\n",
"resolution = 224 #@param\n",
"#@markdown The fps of the input video.\n",
"video_fps = 12 #@param\n",
"#@markdown The fps to display the output plot. Depending on the duration of the input video, it may help to use a lower fps.\n",
"display_fps = 12 #@param\n",
"\n",
"generate_plot(\n",
" model,\n",
" video_url=video_url,\n",
" resolution=resolution,\n",
" video_fps=video_fps,\n",
" display_fps=display_fps)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"last_runtime": {
"build_target": "//learning/deepmind/dm_python:dm_notebook3",
"kind": "private"
},
"name": "plot_movinet_video_stream_predictions.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -21,12 +21,6 @@ task:
global_batch_size: 4096
dtype: 'float32'
aug_rand_hflip: true
drop_remainder: true
validation_data:
input_path: '/readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*'
......
# Token Dropping for Efficient BERT Pretraining
This is the official implementation of the token dropping method
[Pang et al. Token Dropping for Efficient BERT Pretraining. ACL 2022](#reference).
Token dropping aims to accelerate the pretraining of transformer
models such as BERT without degrading its performance on downstream tasks. In
particular, we drop unimportant tokens starting from an intermediate layer in
the model, to make the model focus on important tokens more efficiently with its
limited computational resources. The dropped tokens are later picked up by the
last layer of the model, so that the model still produces full-length sequences.
We leverage the already built-in masked language modeling (MLM) loss and its
dynamics to identify unimportant tokens with practically no computational
overhead. In our experiments, this simple approach reduces the pretraining cost
of BERT by 25% while achieving slightly better overall fine-tuning performance
on standard downstream tasks.
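As a rough sketch of the core idea (the helper below is hypothetical, not this repository's API), the gather-and-scatter pattern looks like the following, where `importance` is a per-token score derived from the running MLM loss:
```python
import tensorflow as tf

def drop_and_restore(hidden, importance, keep_k):
  """Illustrative token dropping: middle layers see only the top-k tokens.

  hidden: [batch, seq_len, width] activations at the intermediate layer.
  importance: [batch, seq_len] per-token scores (e.g., running MLM loss).
  keep_k: number of tokens kept for the middle layers.
  """
  # Keep the k most important tokens, so the expensive middle layers
  # operate on a shorter sequence.
  top_idx = tf.math.top_k(importance, k=keep_k).indices    # [batch, keep_k]
  kept = tf.gather(hidden, top_idx, batch_dims=1)          # [batch, keep_k, width]

  processed = kept  # stand-in for the middle transformer layers

  # The last layer picks the dropped tokens back up: scatter the processed
  # tokens into the full-length sequence so the output stays full length.
  batch = tf.shape(hidden)[0]
  batch_idx = tf.broadcast_to(tf.range(batch)[:, None], tf.shape(top_idx))
  scatter_idx = tf.stack([batch_idx, top_idx], axis=-1)    # [batch, keep_k, 2]
  return tf.tensor_scatter_nd_update(hidden, scatter_idx, processed)
```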
A BERT model pretrained using this token dropping method is no different from
a BERT model pretrained in the conventional way: a BERT checkpoint pretrained
with token dropping can be viewed and used as a normal BERT checkpoint, for
fine-tuning, etc. Thus, this README file only illustrates how to run token
dropping for pretraining.
### Requirements
The starter code requires TensorFlow. If you haven't installed it yet, follow
the instructions on [tensorflow.org][1].
This code has been tested with TensorFlow 2.5.0. Going forward,
we will continue to target the latest released version of TensorFlow.
Please verify that you have Python 3.6+ and TensorFlow 2.5.0 or higher
installed by running the following commands:
```sh
python --version
python -c 'import tensorflow as tf; print(tf.__version__)'
```
Refer to the [instructions here][2]
for using the model in this repo. Make sure to add the models folder to your
Python path.
[1]: https://www.tensorflow.org/install/
[2]:
https://github.com/tensorflow/models/tree/master/official#running-the-models
Then, you need to generate pretraining data. See
[these instructions](https://github.com/tensorflow/models/blob/27fb855b027ead16d2616dcb59c67409a2176b7f/official/legacy/bert/README.md#pre-training)
on how to do that.
## Train using the config files
After you have generated your pretraining data, run the following command to
start pretraining:
```bash
PARAMS="task.train_data.input_data=/path/to/train/data"
PARAMS="${PARAMS},task.validation_data.input_path=/path/to/validation/data"
PARAMS="${PARAMS},runtime.distribution_strategy=tpu"
python3 train.py \
--experiment=token_drop_bert/pretraining \
--config_file=wiki_books_pretrain_sequence_pack.yaml \
--config_file=bert_en_uncased_base_token_drop.yaml \
--params_override=${PARAMS} \
--tpu=local \
--model_dir=/folder/to/hold/logs/and/models/ \
--mode=train_and_eval
```
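The command above assumes a TPU. To train on GPUs instead, one option is to
replace `runtime.distribution_strategy=tpu` with
`runtime.distribution_strategy=mirrored` in `PARAMS` and drop the
`--tpu=local` flag.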
## Implementation
We implement the encoder and layers using `tf.keras` APIs in NLP
modeling library:
* [masked_lm.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/masked_lm.py)
contains the BERT pretraining task.
* [experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/experiment_configs.py)
registers the token dropping experiment.
* [encoder.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/encoder.py)
contains the BERT encoder that supports token dropping.
* [encoder_config.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/encoder_config.py)
contains the config and method for instantiating the token dropping BERT
encoder.
* [train.py](https://github.com/tensorflow/models/blob/master/official/projects/token_dropping/train.py)
is the program entry.
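For a quick smoke test, the encoder can also be instantiated directly. A
minimal sketch, assuming BERT-base-like hyperparameters (illustrative values,
not the paper's exact settings):
```python
import tensorflow as tf

from official.projects.token_dropping import encoder

# Illustrative configuration; see encoder.py for the full argument list.
token_drop_encoder = encoder.TokenDropBertEncoder(
    vocab_size=30522,
    hidden_size=768,
    num_layers=12,
    num_attention_heads=12,
    token_loss_init_value=10.0,  # default loss for never-masked tokens
    token_loss_beta=0.995,       # moving-average factor for token loss
    token_keep_k=256,            # tokens kept in the intermediate layers
    token_allow_list=(100, 101, 102, 103),  # [UNK], [CLS], [SEP], [MASK]
    token_deny_list=(0,))        # [PAD]

outputs = token_drop_encoder(dict(
    input_word_ids=tf.ones((2, 512), dtype=tf.int32),
    input_mask=tf.ones((2, 512), dtype=tf.int32),
    input_type_ids=tf.zeros((2, 512), dtype=tf.int32)))
print(outputs['sequence_output'].shape)  # (2, 512, 768)
```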
## Reference
Please cite our paper:
```
@inproceedings{pang2022,
title={Token Dropping for Efficient BERT Pretraining},
author={Richard Yuanzhe Pang*, Le Hou*, Tianyi Zhou, Yuexin Wu, Xinying Song, Xiaodan Song, Denny Zhou},
year={2022},
organization={Association for Computational Linguistics}
}
```
task:
model:
encoder:
type: any
any:
token_allow_list: !!python/tuple
- 100 # [UNK]
- 101 # [CLS]
- 102 # [SEP]
- 103 # [MASK]
token_deny_list: !!python/tuple
- 0 # [PAD]
attention_dropout_rate: 0.1
dropout_rate: 0.1
hidden_activation: gelu
hidden_size: 768
initializer_range: 0.02
intermediate_size: 3072
max_position_embeddings: 512
num_attention_heads: 12
num_layers: 12
type_vocab_size: 2
vocab_size: 30522
token_loss_init_value: 10.0
token_loss_beta: 0.995
token_keep_k: 256
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer-based BERT encoder network."""
# pylint: disable=g-classes-have-attributes
from typing import Any, Callable, Optional, Union, Tuple
from absl import logging
import tensorflow as tf
from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
class TokenDropBertEncoder(tf.keras.layers.Layer):
"""Bi-directional Transformer-based encoder network with token dropping.
During pretraining, we drop unimportant tokens starting from an intermediate
layer in the model, to make the model focus on important tokens more
efficiently with its limited computational resources. The dropped tokens are
later picked up by the last layer of the model, so that the model still
produces full-length sequences. This approach reduces the pretraining cost of
BERT by 25% while achieving better overall fine-tuning performance on standard
downstream tasks.
Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers within
the transformer layers.
    token_loss_init_value: The default loss value of a token that has never
      been masked and predicted.
    token_loss_beta: The moving-average factor used to compute the running
      average loss value of a token.
token_keep_k: The number of tokens you want to keep in the intermediate
layers. The rest will be dropped in those layers.
    token_allow_list: The list of token-ids that should never be dropped. In
      the BERT English vocab, token-ids 1 to 998 are special tokens such as
      [CLS] and [SEP]. By default, token_allow_list contains [UNK], [CLS],
      [SEP], and [MASK].
    token_deny_list: The list of token-ids that should always be dropped. In
      the BERT English vocab, token-id 0 is [PAD]. By default, token_deny_list
      contains only [PAD].
    initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to generate
embeddings for the input word IDs.
    norm_first: Whether to normalize inputs to attention and intermediate
      dense layers. If set to False, the output of the attention and
      intermediate dense layers is normalized instead.
with_dense_inputs: Whether to accept dense embeddings as the input.
"""
def __init__(
self,
vocab_size: int,
hidden_size: int = 768,
num_layers: int = 12,
num_attention_heads: int = 12,
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
token_loss_init_value: float = 10.0,
token_loss_beta: float = 0.995,
token_keep_k: int = 256,
token_allow_list: Tuple[int, ...] = (100, 101, 102, 103),
token_deny_list: Tuple[int, ...] = (0,),
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
stddev=0.02),
output_range: Optional[int] = None,
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
with_dense_inputs: bool = False,
**kwargs):
# Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs:
kwargs.pop('dict_outputs')
if 'return_all_encoder_outputs' in kwargs:
kwargs.pop('return_all_encoder_outputs')
if 'intermediate_size' in kwargs:
inner_dim = kwargs.pop('intermediate_size')
if 'activation' in kwargs:
inner_activation = kwargs.pop('activation')
if 'dropout_rate' in kwargs:
output_dropout = kwargs.pop('dropout_rate')
if 'attention_dropout_rate' in kwargs:
attention_dropout = kwargs.pop('attention_dropout_rate')
super().__init__(**kwargs)
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
self._embedding_dropout = tf.keras.layers.Dropout(
rate=output_dropout, name='embedding_dropout')
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
self._embedding_projection = None
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
    # The first 999 token-ids are special tokens such as [PAD], [CLS], [SEP].
    # We want to always drop [PAD], and never drop [CLS] or [SEP].
    init_importance = tf.constant(token_loss_init_value, shape=(vocab_size,))
if token_allow_list:
init_importance = tf.tensor_scatter_nd_update(
tensor=init_importance,
indices=[[x] for x in token_allow_list],
updates=[1.0e4 for x in token_allow_list])
if token_deny_list:
init_importance = tf.tensor_scatter_nd_update(
tensor=init_importance,
indices=[[x] for x in token_deny_list],
updates=[-1.0e4 for x in token_deny_list])
self._token_importance_embed = layers.TokenImportanceWithMovingAvg(
vocab_size=vocab_size,
init_importance=init_importance,
moving_average_beta=token_loss_beta)
self._token_separator = layers.SelectTopK(top_k=token_keep_k)
self._transformer_layers = []
self._num_layers = num_layers
self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask')
for i in range(num_layers):
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
self._config = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'token_loss_init_value': token_loss_init_value,
'token_loss_beta': token_loss_beta,
'token_keep_k': token_keep_k,
'token_allow_list': token_allow_list,
'token_deny_list': token_deny_list,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
'with_dense_inputs': with_dense_inputs,
}
if with_dense_inputs:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_inputs=tf.keras.Input(
shape=(None, embedding_width), dtype=tf.float32),
dense_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
)
else:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs):
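    """Runs the encoder, dropping unimportant tokens in middle layers."""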
if isinstance(inputs, dict):
word_ids = inputs.get('input_word_ids')
mask = inputs.get('input_mask')
type_ids = inputs.get('input_type_ids')
dense_inputs = inputs.get('dense_inputs', None)
dense_mask = inputs.get('dense_mask', None)
dense_type_ids = inputs.get('dense_type_ids', None)
else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__)
word_embeddings = self._embedding_layer(word_ids)
if dense_inputs is not None:
# Concat the dense embeddings at sequence end.
word_embeddings = tf.concat([word_embeddings, dense_inputs], axis=1)
type_ids = tf.concat([type_ids, dense_type_ids], axis=1)
mask = tf.concat([mask, dense_mask], axis=1)
# absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = word_embeddings + position_embeddings + type_embeddings
embeddings = self._embedding_norm_layer(embeddings)
embeddings = self._embedding_dropout(embeddings)
if self._embedding_projection is not None:
embeddings = self._embedding_projection(embeddings)
attention_mask = self._attention_mask_layer(embeddings, mask)
encoder_outputs = []
x = embeddings
# Get token routing.
token_importance = self._token_importance_embed(word_ids)
selected, not_selected = self._token_separator(token_importance)
# For a 12-layer BERT:
    # 1. All tokens first go through 5 transformer layers, then
# 2. Only important tokens go through 1 transformer layer with cross
# attention to unimportant tokens, then
# 3. Only important tokens go through 5 transformer layers without cross
# attention.
# 4. Finally, all tokens go through the last layer.
# Step 1.
for layer in self._transformer_layers[:self._num_layers // 2 - 1]:
x = layer([x, attention_mask])
encoder_outputs.append(x)
# Step 2.
# First, separate important and non-important tokens.
x_selected = tf.gather(x, selected, batch_dims=1, axis=1)
mask_selected = tf.gather(mask, selected, batch_dims=1, axis=1)
attention_mask_token_drop = self._attention_mask_layer(
x_selected, mask_selected)
x_not_selected = tf.gather(x, not_selected, batch_dims=1, axis=1)
mask_not_selected = tf.gather(mask, not_selected, batch_dims=1, axis=1)
attention_mask_token_pass = self._attention_mask_layer(
x_selected, tf.concat([mask_selected, mask_not_selected], axis=1))
x_all = tf.concat([x_selected, x_not_selected], axis=1)
# Then, call transformer layer with cross attention.
x_selected = self._transformer_layers[self._num_layers // 2 - 1](
[x_selected, x_all, attention_mask_token_pass])
encoder_outputs.append(x_selected)
# Step 3.
for layer in self._transformer_layers[self._num_layers // 2:-1]:
x_selected = layer([x_selected, attention_mask_token_drop])
encoder_outputs.append(x_selected)
# Step 4.
# First, merge important and non-important tokens.
x_not_selected = tf.cast(x_not_selected, dtype=x_selected.dtype)
x = tf.concat([x_selected, x_not_selected], axis=1)
indices = tf.concat([selected, not_selected], axis=1)
reverse_indices = tf.argsort(indices)
x = tf.gather(x, reverse_indices, batch_dims=1, axis=1)
# Then, call transformer layer with all tokens.
x = self._transformer_layers[-1]([x, attention_mask])
encoder_outputs.append(x)
last_encoder_output = encoder_outputs[-1]
first_token_tensor = last_encoder_output[:, 0, :]
pooled_output = self._pooler_layer(first_token_tensor)
return dict(
sequence_output=encoder_outputs[-1],
pooled_output=pooled_output,
encoder_outputs=encoder_outputs)
def record_mlm_loss(self, mlm_ids: tf.Tensor, mlm_losses: tf.Tensor):
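    """Updates moving-average token importance from per-token MLM losses."""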
self._token_importance_embed.update_token_importance(
token_ids=mlm_ids, importance=mlm_losses)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config)
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
warn_string = (
'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
'train this model, the embedding layer will no longer be shared. '
'To work around this, load the model outside of the Keras API.')
print('WARNING: ' + warn_string)
logging.warn(warn_string)
return cls(**config)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Token dropping encoder configuration and instantiation."""
import dataclasses
from typing import Tuple
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.projects.token_dropping import encoder
@dataclasses.dataclass
class TokenDropBertEncoderConfig(encoders.BertEncoderConfig):
token_loss_init_value: float = 10.0
token_loss_beta: float = 0.995
token_keep_k: int = 256
token_allow_list: Tuple[int, ...] = (100, 101, 102, 103)
token_deny_list: Tuple[int, ...] = (0,)
@base_config.bind(TokenDropBertEncoderConfig)
def get_encoder(encoder_cfg: TokenDropBertEncoderConfig):
"""Instantiates 'TokenDropBertEncoder'.
Args:
encoder_cfg: A 'TokenDropBertEncoderConfig'.
Returns:
    An 'encoder.TokenDropBertEncoder' object.
"""
return encoder.TokenDropBertEncoder(
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
intermediate_size=encoder_cfg.intermediate_size,
activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
dict_outputs=True,
norm_first=encoder_cfg.norm_first,
token_loss_init_value=encoder_cfg.token_loss_init_value,
token_loss_beta=encoder_cfg.token_loss_beta,
token_keep_k=encoder_cfg.token_keep_k,
token_allow_list=encoder_cfg.token_allow_list,
token_deny_list=encoder_cfg.token_deny_list)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import bert_encoder
from official.projects.token_dropping import encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class TokenDropBertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(TokenDropBertEncoderTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_dict_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_dict_outputs_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
dict_outputs=True,
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_dict_outputs_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=4,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # If float_dtype is set to float16, the data output is float32 (from a
    # layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence_encoder", None, 21),
("output_range_encoder", 1, 1),
)
def test_dict_outputs_network_invocation(
self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16,
dict_outputs=True,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
inputs = dict(
input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids)
_ = test_network(inputs)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
return_all_encoder_outputs=True,
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=4,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # If float_dtype is set to float16, the data output is float32 (from a
    # layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = encoder.TokenDropBertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16,
token_keep_k=2,
token_allow_list=(),
token_deny_list=())
dict_outputs = test_network(
dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
class TokenDropCompatibilityTest(tf.test.TestCase):
def tearDown(self):
super().tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_checkpoint_forward_compatible(self):
batch_size = 3
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
kwargs = dict(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=None)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
data = dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data)
old_net = bert_encoder.BertEncoderV2(**kwargs)
old_net_outputs = old_net(data)
ckpt = tf.train.Checkpoint(net=old_net)
path = ckpt.save(self.get_temp_dir())
new_net = encoder.TokenDropBertEncoder(
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=(),
**kwargs)
new_ckpt = tf.train.Checkpoint(net=new_net)
status = new_ckpt.restore(path)
status.assert_existing_objects_matched()
# assert_consumed will fail because the old model has redundant nodes.
new_net_outputs = new_net(data)
self.assertAllEqual(old_net_outputs.keys(), new_net_outputs.keys())
for key in old_net_outputs:
self.assertAllClose(old_net_outputs[key], new_net_outputs[key])
def test_keras_model_checkpoint_forward_compatible(self):
batch_size = 3
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
kwargs = dict(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=None)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
data = dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data)
old_net = bert_encoder.BertEncoderV2(**kwargs)
inputs = old_net.inputs
outputs = old_net(inputs)
old_model = tf.keras.Model(inputs=inputs, outputs=outputs)
old_model_outputs = old_model(data)
ckpt = tf.train.Checkpoint(net=old_model)
path = ckpt.save(self.get_temp_dir())
new_net = encoder.TokenDropBertEncoder(
token_keep_k=sequence_length,
token_allow_list=(),
token_deny_list=(),
**kwargs)
inputs = new_net.inputs
outputs = new_net(inputs)
new_model = tf.keras.Model(inputs=inputs, outputs=outputs)
new_ckpt = tf.train.Checkpoint(net=new_model)
new_ckpt.restore(path)
new_model_outputs = new_model(data)
self.assertAllEqual(old_model_outputs.keys(), new_model_outputs.keys())
for key in old_model_outputs:
self.assertAllClose(old_model_outputs[key], new_model_outputs[key])
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Token dropping BERT experiment configurations.
Only pretraining configs are defined here. Token dropping BERT checkpoints can
be used directly as regular BERT checkpoints, so you can just use regular BERT
for fine-tuning.
"""
# pylint: disable=g-doc-return-or-yield,line-too-long
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.projects.token_dropping import encoder_config
from official.projects.token_dropping import masked_lm
@exp_factory.register_config_factory('token_drop_bert/pretraining')
def token_drop_bert_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining with token dropping."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.TokenDropMaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
any=encoder_config.TokenDropBertEncoderConfig(
vocab_size=30522, num_layers=1, token_keep_k=64),
type='any')),
train_data=pretrain_dataloader.BertPretrainDataConfig(),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
is_training=False)),
trainer=cfg.TrainerConfig(
train_steps=1000000,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 1e-4,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Masked language task."""
import dataclasses
from typing import Tuple
import tensorflow as tf
from official.core import task_factory
from official.nlp.tasks import masked_lm
@dataclasses.dataclass
class TokenDropMaskedLMConfig(masked_lm.MaskedLMConfig):
"""The model config."""
pass
@task_factory.register_task_cls(TokenDropMaskedLMConfig)
class TokenDropMaskedLMTask(masked_lm.MaskedLMTask):
"""Task object for Mask language modeling."""
def build_losses(self,
labels,
model_outputs,
metrics,
aux_losses=None) -> Tuple[tf.Tensor, tf.Tensor]:
"""Return the final loss, and the masked-lm loss."""
with tf.name_scope('MaskedLMTask/losses'):
metrics = dict([(metric.name, metric) for metric in metrics])
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels['masked_lm_ids'],
tf.cast(model_outputs['mlm_logits'], tf.float32),
from_logits=True)
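      # Keep the unreduced per-prediction losses; train_step feeds them back
      # to the encoder to update token importance for dropping.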
lm_label_weights = labels['masked_lm_weights']
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses *
lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
metrics['lm_example_loss'].update_state(mlm_loss)
if 'next_sentence_labels' in labels:
sentence_labels = labels['next_sentence_labels']
sentence_outputs = tf.cast(
model_outputs['next_sentence'], dtype=tf.float32)
sentence_loss = tf.reduce_mean(
tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels, sentence_outputs, from_logits=True))
metrics['next_sentence_loss'].update_state(sentence_loss)
total_loss = mlm_loss + sentence_loss
else:
total_loss = mlm_loss
if aux_losses:
total_loss += tf.add_n(aux_losses)
return total_loss, lm_prediction_losses
def train_step(self, inputs, model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer, metrics):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
# Computes per-replica loss.
loss, lm_prediction_losses = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
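      # Feed the per-token MLM losses back into the encoder's moving-average
      # token-importance table; this drives which tokens get dropped.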
model.encoder_network.record_mlm_loss(
mlm_ids=inputs['masked_lm_ids'],
mlm_losses=lm_prediction_losses)
if self.task_config.scale_loss:
# Scales loss as the default gradients allreduce performs sum inside the
# optimizer.
scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
tvars = model.trainable_variables
if self.task_config.scale_loss:
grads = tape.gradient(scaled_loss, tvars)
else:
grads = tape.gradient(loss, tvars)
optimizer.apply_gradients(list(zip(grads, tvars)))
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
def validation_step(self, inputs, model: tf.keras.Model, metrics):
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
outputs = self.inference_step(inputs, model)
loss, _ = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.tasks.masked_lm."""
import tensorflow as tf
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.projects.token_dropping import encoder_config
from official.projects.token_dropping import masked_lm
class MLMTaskTest(tf.test.TestCase):
def test_task(self):
config = masked_lm.TokenDropMaskedLMConfig(
init_checkpoint=self.get_temp_dir(),
scale_loss=True,
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
any=encoder_config.TokenDropBertEncoderConfig(
vocab_size=30522, num_layers=1, token_keep_k=64),
type="any"),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=2, name="next_sentence")
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
input_path="dummy",
max_predictions_per_seq=20,
seq_length=128,
global_batch_size=1))
task = masked_lm.TokenDropMaskedLMTask(config)
model = task.build_model()
metrics = task.build_metrics()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
# Saves a checkpoint.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
ckpt.save(config.init_checkpoint)
task.initialize(model)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training binary for running token dropping experiments."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.projects.token_dropping import experiment_configs # pylint: disable=unused-import
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise, a continuous eval
    # job may race against the train job to write the same file.
train_utils.serialize_config(params, model_dir)
  # Sets the mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can significantly speed up models by using float16 on GPUs and bfloat16 on
  # TPUs. loss_scale takes effect only when dtype is float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
task:
init_checkpoint: ''
model:
cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}]
train_data:
drop_remainder: true
global_batch_size: 512
input_path: /path-to-data/wikipedia.tfrecord*,/path-to-data/books.tfrecord*
is_training: true
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: true
validation_data:
drop_remainder: false
global_batch_size: 512
input_path: /path-to-data/wikipedia.tfrecord*,/path-to-data/books.tfrecord*
is_training: false
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: true
trainer:
checkpoint_interval: 20000
max_to_keep: 5
optimizer_config:
learning_rate:
polynomial:
cycle: false
decay_steps: 1000000
end_learning_rate: 0.0
initial_learning_rate: 0.0001
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 10000
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
train_steps: 1000000
validation_interval: 1000
validation_steps: 64
task:
init_checkpoint: ''
model:
cls_heads: []
train_data:
drop_remainder: true
global_batch_size: 512
input_path: /path-to-packed-data/wikipedia.tfrecord*,/path-to-packed-data/books.tfrecord*
is_training: true
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: false
use_position_id: false
use_v2_feature_names: true
validation_data:
drop_remainder: false
global_batch_size: 512
input_path: /path-to-packed-data/wikipedia.tfrecord*,/path-to-packed-data/books.tfrecord*
is_training: false
max_predictions_per_seq: 76
seq_length: 512
use_next_sentence_label: false
use_position_id: false
use_v2_feature_names: true
trainer:
checkpoint_interval: 20000
max_to_keep: 5
optimizer_config:
learning_rate:
polynomial:
cycle: false
decay_steps: 1000000
end_learning_rate: 0.0
initial_learning_rate: 0.0001
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 10000
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
train_steps: 1000000
validation_interval: 1000
validation_steps: 64
......@@ -37,7 +37,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.')
flags.DEFINE_string(
'code_url_prefix',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/nlp',
'The url prefix for links to code.')
flags.DEFINE_bool('search_hints', True,
......@@ -66,9 +66,11 @@ def gen_api_docs(code_url_prefix, site_path, output_dir, project_short_name,
del tfm.nlp.layers.MultiHeadAttention
del tfm.nlp.layers.EinsumDense
branch = code_url_prefix.strip('/').split('/')[-2]
official_url_prefix = (
f'https://github.com/tensorflow/models/blob/{branch}/official/')
url_parts = code_url_prefix.strip('/').split('/')
url_parts = url_parts[:url_parts.index('tensorflow_models')]
url_parts.append('official')
official_url_prefix = '/'.join(url_parts)
nlp_base_dir = pathlib.Path(tfm.nlp.__file__).parent
......
......@@ -38,7 +38,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.')
flags.DEFINE_string(
'code_url_prefix',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/',
'https://github.com/tensorflow/models/blob/master/tensorflow_models/vision',
'The url prefix for links to code.')
flags.DEFINE_bool('search_hints', True,
......@@ -64,9 +64,11 @@ def gen_api_docs(code_url_prefix, site_path, output_dir, project_short_name,
"""Generates api docs for the tensorflow docs package."""
build_api_docs_lib.hide_module_model_and_layer_methods()
branch = code_url_prefix.strip('/').split('/')[-2]
official_url_prefix = (
f'https://github.com/tensorflow/models/blob/{branch}/official/')
url_parts = code_url_prefix.strip('/').split('/')
url_parts = url_parts[:url_parts.index('tensorflow_models')]
url_parts.append('official')
official_url_prefix = '/'.join(url_parts)
vision_base_dir = pathlib.Path(tfm.vision.__file__).parent
......
......@@ -15,5 +15,5 @@
"""Vision package definition."""
# Lint as: python3
# pylint: disable=unused-import
from official.vision.beta import configs
from official.vision.beta import tasks
from official.vision import configs
from official.vision import tasks
......@@ -405,9 +405,9 @@ class SpineNet(tf.keras.Model):
if (block_spec.level < self._min_level or
block_spec.level > self._max_level):
logging.warning(
'SpineNet output level out of range [min_level, max_level] = '
'SpineNet output level %s out of range [min_level, max_level] = '
'[%s, %s] will not be used for further processing.',
self._min_level, self._max_level)
block_spec.level, self._min_level, self._max_level)
endpoints[str(block_spec.level)] = x
return endpoints
......