Commit 9ba5b316 authored by Mark Daoust

Convert to colab format

parent 2c929976
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "wide.ipynb",
"version": "0.3.2",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"cells": [
{
"metadata": {
"id": "Zr7KpBhMcYvE",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"# TensorFlow Linear Model Tutorial\n",
"\n",
"\n"
]
},
{
"metadata": {
"id": "77aETSYDcdoK",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"In this tutorial, we will use the `tf.estimator` API in TensorFlow to solve a\n",
"binary classification problem: Given census data about a person such as age,\n",
"education, marital status, and occupation (the features), we will try to predict\n",
......@@ -25,24 +51,16 @@
]
},
{
"metadata": {
"id": "NQgONe5ecYvE",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "7ab0889a-32f9-4ace-f848-6c808893b88c"
},
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "tf.enable_eager_execution must be called at program startup.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-42-04d0fb7a9ec6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_column\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menable_eager_execution\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/venv3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py\u001b[0m in \u001b[0;36menable_eager_execution\u001b[0;34m(config, device_policy, execution_mode)\u001b[0m\n\u001b[1;32m 5238\u001b[0m \"\"\"\n\u001b[1;32m 5239\u001b[0m return enable_eager_execution_internal(\n\u001b[0;32m-> 5240\u001b[0;31m config, device_policy, execution_mode, None)\n\u001b[0m\u001b[1;32m 5241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/venv3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py\u001b[0m in \u001b[0;36menable_eager_execution_internal\u001b[0;34m(config, device_policy, execution_mode, server_def)\u001b[0m\n\u001b[1;32m 5306\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5307\u001b[0m raise ValueError(\n\u001b[0;32m-> 5308\u001b[0;31m \"tf.enable_eager_execution must be called at program startup.\")\n\u001b[0m\u001b[1;32m 5309\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5310\u001b[0m \u001b[0;31m# Monkey patch to get rid of an unnecessary conditional since the context is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: tf.enable_eager_execution must be called at program startup."
]
}
],
"source": [
"import tensorflow as tf\n",
"import tensorflow.feature_column as fc \n",
......@@ -51,29 +69,32 @@
"import os\n",
"import sys\n",
"from IPython.display import clear_output"
]
],
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"id": "-MPr95UccYvL",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Download the [tutorial code from github](https://github.com/tensorflow/models/tree/master/official/wide_deep/),\n",
" add the root directory to your python path, and jump to the `wide_deep` directory:"
]
},
{
"metadata": {
"id": "yVvFyhnkcYvL",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"outputId": "e57030d7-7f5c-455e-ea0f-55038e909d97"
},
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fatal: destination path 'models' already exists and is not an empty directory.\r\n"
]
}
],
"source": [
"if \"wide_deep\" not in os.getcwd():\n",
" ! git clone --depth 1 https://github.com/tensorflow/models\n",
......@@ -81,55 +102,96 @@
" sys.path.append(models_path) \n",
" os.environ['PYTHONPATH'] += os.pathsep+models_path\n",
" os.chdir(\"models/official/wide_deep\")"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Cloning into 'models'...\n",
"remote: Counting objects: 2826, done.\u001b[K\n",
"remote: Compressing objects: 100% (2375/2375), done.\u001b[K\n",
"remote: Total 2826 (delta 543), reused 1731 (delta 382), pack-reused 0\u001b[K\n",
"Receiving objects: 100% (2826/2826), 371.22 MiB | 39.17 MiB/s, done.\n",
"Resolving deltas: 100% (543/543), done.\n",
"Checking out files: 100% (2934/2934), done.\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "15Ethw-wcYvP",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Execute the data download script:"
]
},
{
"metadata": {
"id": "6QilS4-0cYvQ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "3faf2df7-677e-4a91-c09b-3d81ca30c9c1"
},
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import census_dataset\n",
"import census_main\n",
"\n",
"census_dataset.download(\"/tmp/census_data/\")"
]
],
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"id": "cD5e3ibAcYvS",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Execute the tutorial code with the following command to train the model described in this tutorial, from the command line:"
]
},
{
"metadata": {
"id": "vbJ8jPAhcYvT",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "cc0182c0-90d7-4f9c-b421-0dd67166c6d2"
},
"cell_type": "code",
"source": [
"output = !python -m census_main --model_type=wide --train_epochs=2\n",
"print([line for line in output if 'accuracy:' in line])"
],
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['I0711 14:47:25.747490 139708077598464 tf_logging.py:115] accuracy: 0.833794']\n"
]
}
"['I0711 22:27:15.442501 140285526747008 tf_logging.py:115] accuracy: 0.8360666']\n"
],
"source": [
"output = !python -m census_main --model_type=wide --train_epochs=2\n",
"print([line for line in output if 'accuracy:' in line])"
"name": "stdout"
}
]
},
{
"metadata": {
"id": "AmZ4CpaOcYvV",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Read on to find out how this code builds its linear model.\n",
"\n",
......@@ -151,38 +213,70 @@
]
},
{
"metadata": {
"id": "N6Tgye8bcYvX",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "75152d8d-6afa-4e4e-cc0e-3eac7127f8fd"
},
"cell_type": "code",
"source": [
"!ls /tmp/census_data/"
],
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"adult.data adult.test\r\n"
]
}
],
"source": [
"!ls /tmp/census_data/"
"name": "stdout"
}
]
},
{
"metadata": {
"id": "6y3mj9zKcYva",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "3b44b7dd-5a2d-4943-eb19-20f26d5c7098"
},
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train_file = \"/tmp/census_data/adult.data\"\n",
"test_file = \"/tmp/census_data/adult.test\""
]
],
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"id": "vkn1FNmpcYvb",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "4e27b186-b76c-4f19-ea9d-abe19110e93b"
},
"cell_type": "code",
"source": [
"import pandas\n",
"train_df = pandas.read_csv(train_file, header = None, names = census_dataset._CSV_COLUMNS)\n",
"test_df = pandas.read_csv(test_file, header = None, names = census_dataset._CSV_COLUMNS)\n",
"\n",
"train_df.head()"
],
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
......@@ -338,22 +432,19 @@
"4 0 0 40 Cuba <=50K "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 7
}
],
"source": [
"import pandas\n",
"train_df = pandas.read_csv(train_file, header = None, names = census_dataset._CSV_COLUMNS)\n",
"test_df = pandas.read_csv(test_file, header = None, names = census_dataset._CSV_COLUMNS)\n",
"\n",
"train_df.head()"
]
},
{
"metadata": {
"id": "QZZtXes4cYvf",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"The columns can be grouped into two types—categorical\n",
"and continuous columns:\n",
......@@ -392,10 +483,16 @@
]
},
{
"metadata": {
"id": "N7zNJflKcYvg",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "4aebe747-0fca-4209-cf28-3164080ab89f"
},
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):\n",
" df = df.copy()\n",
......@@ -408,34 +505,31 @@
" ds = ds.batch(batch_size).repeat(num_epochs)\n",
"\n",
" return ds"
]
],
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"id": "WeEgNR9AcYvh",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Since we have eager execution enabled it is easy to inspect the resulting dataset:"
]
},
{
"metadata": {
"id": "ygaKuikecYvi",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"outputId": "071665a2-d23f-4c15-da43-ce0d106d473f"
},
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Some feature keys: ['capital_gain', 'occupation', 'gender', 'capital_loss', 'workclass']\n",
"\n",
"A batch of Ages : tf.Tensor([61 18 37 47 47 32 18 23 28 37], shape=(10,), dtype=int32)\n",
"\n",
"A batch of Labels: tf.Tensor(\n",
"[b'>50K' b'<=50K' b'>50K' b'>50K' b'>50K' b'>50K' b'<=50K' b'<=50K'\n",
" b'<=50K' b'<=50K'], shape=(10,), dtype=string)\n"
]
}
],
"source": [
"ds = easy_input_function(train_df, label_key='income_bracket', num_epochs=5, shuffle=True, batch_size=10)\n",
"\n",
......@@ -447,11 +541,30 @@
"print('A batch of Ages :', feature_batch['age'])\n",
"print()\n",
"print('A batch of Labels:', label_batch )"
],
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": [
"Some feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']\n",
"\n",
"A batch of Ages : tf.Tensor([52 57 31 33 34 22 32 66 35 44], shape=(10,), dtype=int32)\n",
"\n",
"A batch of Labels: tf.Tensor(\n",
"[b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K' b'<=50K'\n",
" b'<=50K' b'>50K'], shape=(10,), dtype=string)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "O_KZxQUucYvm",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"But this approach has severly-limited scalability. For larger data it should be streamed off disk.\n",
"the `census_dataset.input_fn` provides an example of how to do this using `tf.decode_csv` and `tf.data.TextLineDataset`: \n",
......@@ -460,12 +573,23 @@
]
},
{
"metadata": {
"id": "vUTeXaEUcYvn",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 493
},
"outputId": "2da7413a-5e54-4e86-f3c5-07387156ab79"
},
"cell_type": "code",
"source": [
"import inspect\n",
"print(inspect.getsource(census_dataset.input_fn))"
],
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def input_fn(data_file, num_epochs, shuffle, batch_size):\n",
......@@ -496,63 +620,65 @@
" dataset = dataset.batch(batch_size)\n",
" return dataset\n",
"\n"
]
}
],
"source": [
"import inspect\n",
"print(inspect.getsource(census_dataset.input_fn))"
"name": "stdout"
}
]
},
{
"metadata": {
"id": "yyGcv_e-cYvq",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"This input_fn gives equivalent output:"
]
},
{
"metadata": {
"id": "DlsqRZS5cYvr",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"outputId": "31dee63f-80f7-4c7e-f749-a5531d33ab95"
},
"cell_type": "code",
"source": [
"ds = census_dataset.input_fn(train_file, num_epochs=5, shuffle=True, batch_size=10)"
],
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Parsing /tmp/census_data/adult.data\n"
]
],
"name": "stdout"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Logging before flag parsing goes to stderr.\n",
"I0711 14:47:26.362334 140466218788608 tf_logging.py:115] Parsing /tmp/census_data/adult.data\n"
]
}
"I0711 22:27:19.570451 140174775953280 tf_logging.py:115] Parsing /tmp/census_data/adult.data\n"
],
"source": [
"ds = census_dataset.input_fn(train_file, num_epochs=5, shuffle=True, batch_size=10)"
"name": "stderr"
}
]
},
{
"metadata": {
"id": "Mv3as_CEcYvu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"outputId": "3834b00d-9655-488f-d6d2-8d7405848d78"
},
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature keys: ['capital_gain', 'occupation', 'gender', 'capital_loss', 'workclass']\n",
"\n",
"Age batch : tf.Tensor([46 38 42 37 29 48 46 40 73 49], shape=(10,), dtype=int32)\n",
"\n",
"Label batch : tf.Tensor([False False False False False False False False True False], shape=(10,), dtype=bool)\n"
]
}
],
"source": [
"for feature_batch, label_batch in ds:\n",
" break\n",
......@@ -562,29 +688,57 @@
"print('Age batch :', feature_batch['age'])\n",
"print()\n",
"print('Label batch :', label_batch )"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"Feature keys: ['age', 'workclass', 'fnlwgt', 'education', 'education_num']\n",
"\n",
"Age batch : tf.Tensor([31 88 36 46 20 51 30 40 31 49], shape=(10,), dtype=int32)\n",
"\n",
"Label batch : tf.Tensor([False False True True False True True False False True], shape=(10,), dtype=bool)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "810fnfY5cYvz",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `Estimators` expect an `input_fn` that takes no arguments, we typically wrap configurable input function into an obejct with the expected signature. For this notebook configure the `train_inpf` to iterate over the data twice:"
]
},
{
"metadata": {
"id": "wnQdpEcVcYv0",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "b9050d80-e603-4363-dbe9-11c2b368e29d"
},
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import functools\n",
"train_inpf = functools.partial(census_dataset.input_fn, train_file, num_epochs=2, shuffle=True, batch_size=64)\n",
"test_inpf = functools.partial(census_dataset.input_fn, test_file, num_epochs=1, shuffle=False, batch_size=64)"
]
],
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"id": "pboNpNWhcYv4",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Selecting and Engineering Features for the Model\n",
"\n",
......@@ -609,73 +763,92 @@
]
},
{
"metadata": {
"id": "ZX0r2T5OcYv6",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "283bf438-2a96-4bf3-fa89-94da99f93927"
},
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"age = fc.numeric_column('age')"
]
],
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"id": "tnLUiaHxcYv-",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"The model will use the `feature_column` definitions to build the model input. You can inspect the resulting output using the `input_layer` function:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
"id": "kREtIPfwcYv_",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"outputId": "197a798b-9809-45e1-a8d4-ed5d237eea9d"
},
"cell_type": "code",
"source": [
"fc.input_layer(feature_batch, [age]).numpy()"
],
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<tf.Tensor: id=237, shape=(10, 1), dtype=float32, numpy=\n",
"array([[46.],\n",
" [38.],\n",
" [42.],\n",
" [37.],\n",
" [29.],\n",
" [48.],\n",
"array([[31.],\n",
" [88.],\n",
" [36.],\n",
" [46.],\n",
" [20.],\n",
" [51.],\n",
" [30.],\n",
" [40.],\n",
" [73.],\n",
" [49.]], dtype=float32)>"
" [31.],\n",
" [49.]], dtype=float32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 15
}
],
"source": [
"fc.input_layer(feature_batch, [age]).numpy()"
]
},
{
"metadata": {
"id": "OPuLduCucYwD",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"The following code will train and evaluate a model on only the `age` feature."
]
},
{
"metadata": {
"id": "9R5eSJ1pcYwE",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "ea791197-8300-4f31-cee1-f7d1b8209838"
},
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'precision': 0.29166666, 'auc_precision_recall': 0.31132147, 'average_loss': 0.5239897, 'label/mean': 0.23622628, 'auc': 0.6781367, 'loss': 33.4552, 'prediction/mean': 0.22513431, 'accuracy': 0.7631595, 'recall': 0.0018200728, 'global_step': 1018, 'accuracy_baseline': 0.76377374}\n"
]
}
],
"source": [
"classifier = tf.estimator.LinearClassifier(feature_columns=[age], n_classes=2)\n",
"classifier.train(train_inpf)\n",
......@@ -683,98 +856,127 @@
"\n",
"clear_output()\n",
"print(result)"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
"{'accuracy': 0.76334375, 'accuracy_baseline': 0.76377374, 'auc': 0.67818105, 'auc_precision_recall': 0.31133735, 'average_loss': 0.52437353, 'label/mean': 0.23622628, 'loss': 33.479706, 'precision': 0.31578946, 'prediction/mean': 0.22410269, 'recall': 0.0015600624, 'global_step': 1018}\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "YDZGcdTdcYwI",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarly, we can define a `NumericColumn` for each continuous feature column\n",
"that we want to use in the model:"
]
},
{
"metadata": {
"id": "uqPbUqlxcYwJ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "68f4ccfd-d71b-4327-b8e8-25c40e986bed"
},
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"education_num = tf.feature_column.numeric_column('education_num')\n",
"capital_gain = tf.feature_column.numeric_column('capital_gain')\n",
"capital_loss = tf.feature_column.numeric_column('capital_loss')\n",
"hours_per_week = tf.feature_column.numeric_column('hours_per_week')"
]
],
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"id": "yqCF0a4DcYwM",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "0f9097a4-bc79-4e67-bd63-6a4d4461736d"
},
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"my_numeric_columns = [age,education_num, capital_gain, capital_loss, hours_per_week]"
]
],
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"id": "xDrZtAZ0cYwO",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "6fd558ea-9f0c-4deb-cb8a-6211ec233016"
},
"cell_type": "code",
"source": [
"fc.input_layer(feature_batch, my_numeric_columns).numpy()"
],
"execution_count": 19,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<tf.Tensor: id=2160, shape=(10, 5), dtype=float32, numpy=\n",
"array([[4.600e+01, 0.000e+00, 0.000e+00, 6.000e+00, 4.000e+01],\n",
" [3.800e+01, 4.508e+03, 0.000e+00, 1.300e+01, 4.000e+01],\n",
" [4.200e+01, 0.000e+00, 0.000e+00, 1.400e+01, 4.000e+01],\n",
" [3.700e+01, 0.000e+00, 0.000e+00, 1.100e+01, 4.000e+01],\n",
" [2.900e+01, 0.000e+00, 0.000e+00, 9.000e+00, 4.000e+01],\n",
" [4.800e+01, 0.000e+00, 0.000e+00, 1.300e+01, 5.500e+01],\n",
" [4.600e+01, 0.000e+00, 0.000e+00, 9.000e+00, 5.000e+01],\n",
" [4.000e+01, 0.000e+00, 0.000e+00, 9.000e+00, 4.000e+01],\n",
" [7.300e+01, 6.418e+03, 0.000e+00, 4.000e+00, 9.900e+01],\n",
" [4.900e+01, 0.000e+00, 0.000e+00, 4.000e+00, 4.000e+01]],\n",
" dtype=float32)>"
"array([[3.1000e+01, 0.0000e+00, 0.0000e+00, 1.4000e+01, 4.3000e+01],\n",
" [8.8000e+01, 0.0000e+00, 0.0000e+00, 1.5000e+01, 4.0000e+01],\n",
" [3.6000e+01, 1.5024e+04, 0.0000e+00, 9.0000e+00, 4.0000e+01],\n",
" [4.6000e+01, 0.0000e+00, 0.0000e+00, 1.4000e+01, 5.5000e+01],\n",
" [2.0000e+01, 0.0000e+00, 0.0000e+00, 1.0000e+01, 1.0000e+01],\n",
" [5.1000e+01, 5.1780e+03, 0.0000e+00, 1.2000e+01, 4.5000e+01],\n",
" [3.0000e+01, 1.5024e+04, 0.0000e+00, 1.4000e+01, 6.0000e+01],\n",
" [4.0000e+01, 0.0000e+00, 0.0000e+00, 9.0000e+00, 4.0000e+01],\n",
" [3.1000e+01, 0.0000e+00, 0.0000e+00, 1.0000e+01, 1.0000e+01],\n",
" [4.9000e+01, 0.0000e+00, 0.0000e+00, 1.3000e+01, 4.0000e+01]],\n",
" dtype=float32)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 19
}
],
"source": [
"fc.input_layer(feature_batch, my_numeric_columns).numpy()"
]
},
{
"metadata": {
"id": "cBGDN97IcYwQ",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"You could retrain a model on these features with, just by changing the `feature_columns` argument to the constructor:"
]
},
{
"metadata": {
"id": "XN8k5S95cYwR",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "72be27c1-e25c-4609-a703-8297c936177a"
},
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.7817087\n",
"accuracy_baseline: 0.76377374\n",
"auc: 0.8027547\n",
"auc_precision_recall: 0.5611528\n",
"average_loss: 1.0698086\n",
"global_step: 1018\n",
"label/mean: 0.23622628\n",
"loss: 68.30414\n",
"precision: 0.57025987\n",
"prediction/mean: 0.36397633\n",
"recall: 0.30811232\n"
]
}
],
"source": [
"classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns, n_classes=2)\n",
"classifier.train(train_inpf)\n",
......@@ -784,11 +986,34 @@
"clear_output()\n",
"for key,value in sorted(result.items()):\n",
" print('%s: %s' % (key, value))"
],
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"text": [
"accuracy: 0.76377374\n",
"accuracy_baseline: 0.76377374\n",
"auc: 0.539677\n",
"auc_precision_recall: 0.334656\n",
"average_loss: 1.4886041\n",
"global_step: 1018\n",
"label/mean: 0.23622628\n",
"loss: 95.04299\n",
"precision: 0.0\n",
"prediction/mean: 0.21315515\n",
"recall: 0.0\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "jBRq9_AzcYwU",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Categorical columns\n",
"\n",
......@@ -799,20 +1024,31 @@
]
},
{
"metadata": {
"id": "0IjqSi9tcYwV",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "859f282d-7a9c-417b-a615-643a15d10118"
},
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"relationship = fc.categorical_column_with_vocabulary_list(\n",
" 'relationship', [\n",
" 'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',\n",
" 'Other-relative'])\n"
]
],
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"id": "-RjoWv-7cYwW",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"This will create a sparse one-hot vector from the raw input feature.\n",
"\n",
......@@ -824,122 +1060,168 @@
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
"id": "kI43CYlncYwY",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"outputId": "458177e5-4bc0-48f2-b1fb-614b91dd99e6"
},
"cell_type": "code",
"source": [
"fc.input_layer(feature_batch, [age, fc.indicator_column(relationship)])"
],
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<tf.Tensor: id=4490, shape=(10, 7), dtype=float32, numpy=\n",
"array([[46., 0., 0., 0., 0., 1., 0.],\n",
" [38., 1., 0., 0., 0., 0., 0.],\n",
" [42., 0., 1., 0., 0., 0., 0.],\n",
" [37., 1., 0., 0., 0., 0., 0.],\n",
" [29., 1., 0., 0., 0., 0., 0.],\n",
" [48., 1., 0., 0., 0., 0., 0.],\n",
"<tf.Tensor: id=4361, shape=(10, 7), dtype=float32, numpy=\n",
"array([[31., 0., 1., 0., 0., 0., 0.],\n",
" [88., 1., 0., 0., 0., 0., 0.],\n",
" [36., 1., 0., 0., 0., 0., 0.],\n",
" [46., 1., 0., 0., 0., 0., 0.],\n",
" [20., 0., 1., 0., 0., 0., 0.],\n",
" [51., 1., 0., 0., 0., 0., 0.],\n",
" [30., 1., 0., 0., 0., 0., 0.],\n",
" [40., 1., 0., 0., 0., 0., 0.],\n",
" [73., 1., 0., 0., 0., 0., 0.],\n",
" [49., 1., 0., 0., 0., 0., 0.]], dtype=float32)>"
" [31., 0., 0., 1., 0., 0., 0.],\n",
" [49., 0., 1., 0., 0., 0., 0.]], dtype=float32)>"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 22
}
],
"source": [
"fc.input_layer(feature_batch, [age, fc.indicator_column(relationship)])"
]
},
{
"metadata": {
"id": "tTudP7WHcYwb",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"What if we don't know the set of possible values in advance? Not a problem. We\n",
"can use `categorical_column_with_hash_bucket` instead:"
]
},
{
"metadata": {
"id": "8pSBaliCcYwb",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "e9b2e611-1311-4933-af0a-489e03fdc960"
},
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"occupation = tf.feature_column.categorical_column_with_hash_bucket(\n",
" 'occupation', hash_bucket_size=1000)"
]
],
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"id": "fSAPrqQkcYwd",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"What will happen is that each possible value in the feature column `occupation`\n",
"will be hashed to an integer ID as we encounter them in training. The example batch has a few different occupations:"
]
},
{
"metadata": {
"id": "dCvQNv36cYwe",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 207
},
"outputId": "23ebfedd-faf8-425b-a855-9897aba20341"
},
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"source": [
"for item in feature_batch['occupation'].numpy():\n",
" print(item.decode())"
],
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Machine-op-inspct\n",
"Transport-moving\n",
"Prof-specialty\n",
"Adm-clerical\n",
"Handlers-cleaners\n",
"Exec-managerial\n",
"Prof-specialty\n",
"Other-service\n",
"Farming-fishing\n",
"Farming-fishing\n",
"Handlers-cleaners\n"
]
}
"Exec-managerial\n",
"Tech-support\n",
"Sales\n",
"Exec-managerial\n",
"Machine-op-inspct\n",
"?\n",
"Exec-managerial\n"
],
"source": [
"for item in feature_batch['occupation'].numpy():\n",
" print(item.decode())"
"name": "stdout"
}
]
},
{
"metadata": {
"id": "KP5hN2rAcYwh",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"if we run `input_layer` with the hashed column we see that the output shape is `(batch_size, hash_bucket_size)`"
]
},
{
"metadata": {
"id": "0Y16peWacYwh",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "524b1af5-c492-4d0e-b736-7974ca618089"
},
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"source": [
"occupation_result = fc.input_layer(feature_batch, [fc.indicator_column(occupation)])\n",
"\n",
"occupation_result.numpy().shape"
],
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(10, 1000)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 25
}
],
"source": [
"occupation_result = fc.input_layer(feature_batch, [fc.indicator_column(occupation)])\n",
"\n",
"occupation_result.numpy().shape"
]
},
{
"metadata": {
"id": "HMW2MzWAcYwk",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"It's easier to see the actual results if we take the tf.argmax over the `hash_bucket_size` dimension.\n",
"\n",
......@@ -949,28 +1231,41 @@
]
},
{
"metadata": {
"id": "q_ryRglmcYwk",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "e1797664-1200-48e3-c774-52e7e0a18f00"
},
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"source": [
"tf.argmax(occupation_result, axis=1).numpy()"
],
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([911, 420, 979, 96, 10, 979, 527, 936, 936, 10])"
"array([979, 800, 979, 800, 413, 631, 800, 911, 65, 800])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 26
}
],
"source": [
"tf.argmax(occupation_result, axis=1).numpy()"
]
},
{
"metadata": {
"id": "j1e5NfyKcYwn",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"No matter which way we choose to define a `SparseColumn`, each feature string\n",
"will be mapped into an integer ID by looking up a fixed mapping or by hashing.\n",
......@@ -983,10 +1278,16 @@
]
},
{
"metadata": {
"id": "0Z5eUrd_cYwo",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "becd1bda-9014-4b9e-92ef-ba4ee2ed52fa"
},
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"education = tf.feature_column.categorical_column_with_vocabulary_list(\n",
" 'education', [\n",
......@@ -1003,47 +1304,48 @@
" 'workclass', [\n",
" 'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',\n",
" 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])\n"
]
],
"execution_count": 27,
"outputs": []
},
{
"metadata": {
"id": "a03l9ozUcYwp",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "374c7f00-8d2e-458f-ec32-b4cbc6b7386f"
},
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"my_categorical_columns = [relationship, occupation, education, marital_status, workclass]"
]
],
"execution_count": 28,
"outputs": []
},
{
"metadata": {
"id": "ASQJM1pEcYwr",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"It's easy to use both sets of columns to configure a model that uses all these features:"
]
},
{
"metadata": {
"id": "_i_MLoo9cYws",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"outputId": "95ab18a4-2ec1-4fad-c207-2f86b607a333"
},
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.83342546\n",
"accuracy_baseline: 0.76377374\n",
"auc: 0.8807037\n",
"auc_precision_recall: 0.6601031\n",
"average_loss: 0.8671454\n",
"global_step: 1018\n",
"label/mean: 0.23622628\n",
"loss: 55.36468\n",
"precision: 0.6496042\n",
"prediction/mean: 0.2628341\n",
"recall: 0.6401456\n"
]
}
],
"source": [
"classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns+my_categorical_columns, n_classes=2)\n",
"classifier.train(train_inpf)\n",
......@@ -1052,11 +1354,34 @@
"clear_output()\n",
"for key,value in sorted(result.items()):\n",
" print('%s: %s' % (key, value))"
],
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"text": [
"accuracy: 0.81978995\n",
"accuracy_baseline: 0.76377374\n",
"auc: 0.869223\n",
"auc_precision_recall: 0.6459037\n",
"average_loss: 1.9878242\n",
"global_step: 1018\n",
"label/mean: 0.23622628\n",
"loss: 126.916725\n",
"precision: 0.60679156\n",
"prediction/mean: 0.2908891\n",
"recall: 0.6736869\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "zdKEqF6xcYwv",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Derived feature columns\n",
"\n",
......@@ -1082,18 +1407,29 @@
]
},
{
"metadata": {
"id": "KT4pjD9AcYww",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "633c1bb5-e5e2-4cf3-8392-5caf473607da"
},
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"age_buckets = tf.feature_column.bucketized_column(\n",
" age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])"
]
],
"execution_count": 30,
"outputs": []
},
{
"metadata": {
"id": "S-XOscrEcYwx",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"where the `boundaries` is a list of bucket boundaries. In this case, there are\n",
"10 boundaries, resulting in 11 age group buckets (from age 17 and below, 18-24,\n",
......@@ -1103,38 +1439,51 @@
]
},
{
"metadata": {
"id": "Lr40vm3qcYwy",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "e53a3d92-f8d4-4ff7-da5e-46f498eb2316"
},
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"source": [
"fc.input_layer(feature_batch, [age, age_buckets]).numpy()"
],
"execution_count": 31,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[46., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
" [38., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
" [42., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],\n",
" [37., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
" [29., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
" [48., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
"array([[31., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
" [88., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n",
" [36., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
" [46., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
" [20., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
" [51., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],\n",
" [30., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
" [40., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],\n",
" [73., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n",
" [31., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
" [49., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],\n",
" dtype=float32)"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 31
}
],
"source": [
"fc.input_layer(feature_batch, [age, age_buckets]).numpy()"
]
},
{
"metadata": {
"id": "Z_tQI9j8cYw1",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Learn complex relationships with crossed column\n",
"\n",
......@@ -1150,18 +1499,29 @@
]
},
{
"metadata": {
"id": "IAPhPzXscYw1",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "4dd22eaf-3917-449d-9068-5306ae60b6a6"
},
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"education_x_occupation = tf.feature_column.crossed_column(\n",
" ['education', 'occupation'], hash_bucket_size=1000)"
]
],
"execution_count": 32,
"outputs": []
},
{
"metadata": {
"id": "UeTxMunbcYw5",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also create a `crossed_column` over more than two columns. Each\n",
"constituent column can be either a base feature column that is categorical\n",
......@@ -1170,18 +1530,29 @@
]
},
{
"metadata": {
"id": "y8UaBld9cYw7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 37
},
"outputId": "4abb43e7-c406-4caf-f15e-71af723ec8df"
},
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(\n",
" [age_buckets, 'education', 'occupation'], hash_bucket_size=1000)"
]
],
"execution_count": 33,
"outputs": []
},
{
"metadata": {
"id": "HvKmW6U5cYw8",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"These crossed columns always use hash buckets to avoid the exponential explosion in the number of categories, and put the control over number of model weights in the hands of the user.\n",
"\n",
......@@ -1190,8 +1561,11 @@
]
},
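{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not part of the original tutorial text), we can inspect the crossed column the same way we inspected the hashed `occupation` column above, assuming the cross accepts the dense string tensors in `feature_batch`. The output should have shape `(batch_size, hash_bucket_size)`, i.e. `(10, 1000)` for this batch:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch only: one-hot encode the education x occupation cross for the sample batch.\n",
"crossed_result = fc.input_layer(\n",
"    feature_batch, [fc.indicator_column(education_x_occupation)])\n",
"print(crossed_result.numpy().shape)\n",
"\n",
"# As before, argmax over the hash-bucket dimension shows which bucket each example landed in.\n",
"print(tf.argmax(crossed_result, axis=1).numpy())"
],
"execution_count": null,
"outputs": []
},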
{
"metadata": {
"id": "HtjpheB6cYw9",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Defining The Logistic Regression Model\n",
"\n",
......@@ -1210,39 +1584,16 @@
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Using default config.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"I0711 14:48:54.071429 140466218788608 tf_logging.py:115] Using default config.\n"
]
"metadata": {
"id": "Klmf3OxpcYw-",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 105
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_num_worker_replicas': 1, '_device_fn': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc03341f668>, '_evaluation_master': '', '_train_distribute': None, '_model_dir': '/tmp/tmpligbanno', '_session_config': None, '_save_checkpoints_steps': None, '_master': '', '_num_ps_replicas': 0, '_task_type': 'worker', '_log_step_count_steps': 100, '_save_summary_steps': 100, '_service': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_keep_checkpoint_max': 5}\n"
]
"outputId": "a8f46b90-a9d0-4d33-fff5-38b530e35d43"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"I0711 14:48:54.073915 140466218788608 tf_logging.py:115] Using config: {'_global_id_in_cluster': 0, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_num_worker_replicas': 1, '_device_fn': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc03341f668>, '_evaluation_master': '', '_train_distribute': None, '_model_dir': '/tmp/tmpligbanno', '_session_config': None, '_save_checkpoints_steps': None, '_master': '', '_num_ps_replicas': 0, '_task_type': 'worker', '_log_step_count_steps': 100, '_save_summary_steps': 100, '_service': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_keep_checkpoint_max': 5}\n"
]
}
],
"cell_type": "code",
"source": [
"import tempfile\n",
"\n",
......@@ -1260,11 +1611,45 @@
"model_dir = tempfile.mkdtemp()\n",
"model = tf.estimator.LinearClassifier(\n",
" model_dir=model_dir, feature_columns=base_columns + crossed_columns)"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"INFO:tensorflow:Using default config.\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"I0711 22:27:55.502184 140174775953280 tf_logging.py:115] Using default config.\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp93vf5hp6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7cc6df0ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"I0711 22:27:55.509107 140174775953280 tf_logging.py:115] Using config: {'_model_dir': '/tmp/tmp93vf5hp6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7cc6df0ba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
],
"name": "stderr"
}
]
},
{
"metadata": {
"id": "jRhnPxUucYxC",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"The model also automatically learns a bias term, which controls the prediction\n",
"one would make without observing any features (see the section [How Logistic\n",
......@@ -1279,30 +1664,54 @@
]
},
{
"metadata": {
"id": "ZlrIBuoecYxD",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"outputId": "5aa0bc8c-9496-4301-963a-78bcef54e17a"
},
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"model.train(train_inpf)\n",
"clear_output()"
]
],
"execution_count": 35,
"outputs": []
},
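{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small aside (not in the original tutorial), the bias term mentioned above can be read back from the trained estimator's checkpoint. This sketch simply searches the variable names for `bias`, since the exact variable name depends on the TensorFlow version:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Print any bias variables stored in the model's checkpoint.\n",
"for name in model.get_variable_names():\n",
"    if 'bias' in name:\n",
"        print(name, model.get_variable_value(name))"
],
"execution_count": null,
"outputs": []
},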
{
"metadata": {
"id": "IvY3a9pzcYxH",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"After the model is trained, we can evaluate how good our model is at predicting\n",
"the labels of the holdout data:"
]
},
{
"metadata": {
"id": "L9nVJEO8cYxI",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "8eb14bd7-9030-4381-c18a-6a5c7c17c569"
},
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"source": [
"results = model.evaluate(test_inpf)\n",
"clear_output()\n",
"for key in sorted(results):\n",
" print('%s: %0.2f' % (key, results[key]))"
],
"execution_count": 36,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.84\n",
......@@ -1312,23 +1721,21 @@
"average_loss: 0.35\n",
"global_step: 1018.00\n",
"label/mean: 0.24\n",
"loss: 22.37\n",
"precision: 0.69\n",
"prediction/mean: 0.24\n",
"recall: 0.57\n"
]
}
"loss: 22.42\n",
"precision: 0.71\n",
"prediction/mean: 0.22\n",
"recall: 0.52\n"
],
"source": [
"results = model.evaluate(test_inpf)\n",
"clear_output()\n",
"for key in sorted(results):\n",
" print('%s: %0.2f' % (key, results[key]))"
"name": "stdout"
}
]
},
{
"metadata": {
"id": "E0fAibNDcYxL",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"The first line of the final output should be something like\n",
"`accuracy: 0.83`, which means the accuracy is 83%. Feel free to try more\n",
......@@ -1341,11 +1748,39 @@
]
},
{
"metadata": {
"id": "8R5bz5CxcYxL",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 669
},
"outputId": "71f5e775-0d24-4356-d785-3b06aa385957"
},
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"source": [
"import numpy as np\n",
"predict_df = test_df[:20].copy()\n",
"\n",
"pred_iter = model.predict(\n",
" lambda:easy_input_function(predict_df, label_key='income_bracket',\n",
" num_epochs=1, shuffle=False, batch_size=10))\n",
"\n",
"classes = np.array(['<=50K', '>50K'])\n",
"pred_class_id = []\n",
"for pred_dict in pred_iter:\n",
" pred_class_id.append(pred_dict['class_ids'])\n",
"\n",
"predict_df['predicted_class'] = classes[np.array(pred_class_id)]\n",
"predict_df['correct'] = predict_df['predicted_class'] == predict_df['income_bracket']\n",
"\n",
"clear_output()\n",
"predict_df[['income_bracket','predicted_class', 'correct']]"
],
"execution_count": 37,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
......@@ -1520,34 +1955,19 @@
"19 >50K >50K True"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"tags": []
},
"execution_count": 37
}
],
"source": [
"import numpy as np\n",
"predict_df = test_df[:20].copy()\n",
"\n",
"pred_iter = model.predict(\n",
" lambda:easy_input_function(predict_df, label_key='income_bracket',\n",
" num_epochs=1, shuffle=False, batch_size=10))\n",
"\n",
"classes = np.array(['<=50K', '>50K'])\n",
"pred_class_id = []\n",
"for pred_dict in pred_iter:\n",
" pred_class_id.append(pred_dict['class_ids'])\n",
"\n",
"predict_df['predicted_class'] = classes[np.array(pred_class_id)]\n",
"predict_df['correct'] = predict_df['predicted_class'] == predict_df['income_bracket']\n",
"\n",
"clear_output()\n",
"predict_df[['income_bracket','predicted_class', 'correct']]"
]
},
{
"metadata": {
"id": "N_uCpFTicYxN",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"If you'd like to see a working end-to-end example, you can download our\n",
"[example code](https://github.com/tensorflow/models/tree/master/official/wide_deep/census_main.py)\n",
......@@ -1568,28 +1988,16 @@
]
},
{
"metadata": {
"id": "cVv2HsqocYxO",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "68504270-5bcc-4a87-dbfa-7fd94cf54dff"
},
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy: 0.84\n",
"accuracy_baseline: 0.76\n",
"auc: 0.89\n",
"auc_precision_recall: 0.70\n",
"average_loss: 0.35\n",
"global_step: 2036.00\n",
"label/mean: 0.24\n",
"loss: 22.29\n",
"precision: 0.69\n",
"prediction/mean: 0.24\n",
"recall: 0.56\n"
]
}
],
"source": [
"#TODO(markdaoust): is the regularization strength here not working?\n",
"model = tf.estimator.LinearClassifier(\n",
......@@ -1605,11 +2013,34 @@
"clear_output()\n",
"for key in sorted(results):\n",
" print('%s: %0.2f' % (key, results[key]))"
],
"execution_count": 38,
"outputs": [
{
"output_type": "stream",
"text": [
"accuracy: 0.84\n",
"accuracy_baseline: 0.76\n",
"auc: 0.89\n",
"auc_precision_recall: 0.70\n",
"average_loss: 0.35\n",
"global_step: 2036.00\n",
"label/mean: 0.24\n",
"loss: 22.28\n",
"precision: 0.70\n",
"prediction/mean: 0.24\n",
"recall: 0.55\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "5AqvPEQwcYxU",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"One important difference between L1 and L2 regularization is that L1\n",
"regularization tends to make model weights stay at zero, creating sparser\n",
......@@ -1626,8 +2057,11 @@
]
},
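{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough, optional sketch (not from the original tutorial), we can count how many weights of the regularized `model` trained above are effectively zero. It only relies on `Estimator.get_variable_names` and `Estimator.get_variable_value`; the name filter and the `1e-4` threshold are arbitrary choices for illustration:"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import numpy as np\n",
"\n",
"# Collect the linear model's weight variables from the latest checkpoint.\n",
"weight_names = [name for name in model.get_variable_names()\n",
"                if 'linear_model' in name and name.endswith('weights')]\n",
"\n",
"total = 0\n",
"near_zero = 0\n",
"for name in weight_names:\n",
"    w = model.get_variable_value(name)\n",
"    total += w.size\n",
"    near_zero += int(np.sum(np.abs(w) < 1e-4))\n",
"\n",
"print('near-zero weights: %d / %d' % (near_zero, total))"
],
"execution_count": null,
"outputs": []
},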
{
"metadata": {
"id": "i5119iMWcYxU",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"how_it_works\"> </a>\n",
"## How Logistic Regression Works\n",
......@@ -1675,8 +2109,11 @@
]
},
{
"metadata": {
"id": "hbXuPYQIcYxV",
"colab_type": "text"
},
"cell_type": "markdown",
"metadata": {},
"source": [
"## What Next\n",
"\n",
......@@ -1689,32 +2126,21 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
"id": "jpdw2z5WcYxV",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
"outputId": "403d18f6-d01e-47dc-dfc7-8c95d9a8ec34"
},
"nbformat": 4,
"nbformat_minor": 2
"cell_type": "code",
"source": [
""
],
"execution_count": 38,
"outputs": []
}
]
}
\ No newline at end of file