# data_utils.py
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for parsing Kaggle baby names files."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import numpy as np
import tensorflow as tf
import pandas as pd

# End-of-name (EON) token appended after each encoded name. Zero is used
# because _letter_to_number maps 'a' to 1, so 0 never collides with a letter
# code; it also matches the zero-initialised padding of the batch buffers.
_EON = 0


def read_names(names_path):
    """Load a baby-names csv and aggregate the counts per lowercased name.

    See SmallNames.txt for an example of the expected format, or go to
    https://www.kaggle.com/kaggle/us-baby-names for the full lists.

    Args:
        names_path: path to a csv file with at least a `Name` and a `Count`
            column
    Returns:
        Dataset: a namedtuple with two fields, `Name` (the deduped, lowercased
            names) and `Count` (the summed count of each name), both numpy
            arrays in matching order
    """
    frame = pd.read_csv(names_path)
    # fold case first so that e.g. "Mary" and "MARY" land in the same group
    frame.Name = frame.Name.str.lower()

    totals_by_name = frame.groupby(by=["Name"])["Count"].sum()

    Dataset = collections.namedtuple('Dataset', ['Name', 'Count'])
    return Dataset(
        Name=np.array(totals_by_name.index.tolist()),
        Count=np.array(totals_by_name.tolist()),
    )


def _letter_to_number(letter):
    """Map a single character to its 1-based offset from lowercase 'a'.

    For the lowercase letters 'a'..'z' this yields 1..26.
    """
    # 'a' maps to 1 rather than 0, which leaves 0 unused by any letter
    return ord(letter) - ord('a') + 1


def namignizer_iterator(names, counts, batch_size, num_steps, epoch_size):
    """Yield batches of randomly sampled names encoded as numbers.

    Takes names and counts like those output from read_names, and makes an
    iterator yielding batch_size by num_steps arrays of random names separated
    by an end of name token. The names are chosen randomly according to their
    counts. The batch may end mid-name.

    Args:
        names: a 1-D sequence/array of lowercase names composed of 26
            characters
        counts: the frequency of those names, in the same order as `names`
            (a list or a numpy array)
        batch_size: int
        num_steps: int
        epoch_size: number of batches to yield
    Yields:
        (x, y): two batch_size by num_steps arrays holding letter codes, where
            x will be the input and y will be the target (x shifted by one
            position). The arrays are created with np.zeros, so the codes are
            stored as floats.
    """
    # np.asarray lets a plain list of counts be used as well as an ndarray
    counts = np.asarray(counts, dtype=float)
    name_distribution = counts / counts.sum()

    total = batch_size * num_steps
    # one extra slot so the target of the very last input position is a real
    # sampled token instead of always being left at zero
    buffer_len = total + 1

    for _ in range(epoch_size):
        data = np.zeros(buffer_len)
        # every non-empty name contributes at least two tokens (a letter and
        # the end of name token), so total // 2 samples fill the buffer
        samples = np.random.choice(names, size=total // 2,
                                   replace=True, p=name_distribution)

        data_index = 0
        for sample in samples:
            if data_index >= buffer_len:
                break
            # build a real list: on Python 3 map() returns an iterator, which
            # cannot be concatenated with a list
            encoded = [_letter_to_number(letter) for letter in sample]
            encoded.append(_EON)
            # truncate the last name so the batch may end mid-name
            encoded = encoded[:buffer_len - data_index]
            data[data_index:data_index + len(encoded)] = encoded
            data_index += len(encoded)

        x = data[:total].reshape((batch_size, num_steps))
        y = data[1:total + 1].reshape((batch_size, num_steps))

        yield (x, y)


def name_to_batch(name, batch_size, num_steps):
    """Take a single name and fill a batch with it.

    Args:
        name: lowercase composed of 26 characters
        batch_size: int
        num_steps: int
    Returns:
        x, y: two batch_size by num_steps arrays holding letter codes, where
            x will be the input and y will be the target (x shifted by one
            position). The array is filled up to the length of the string
            plus the end of name token, the rest is filled with zeros. The
            arrays are created with np.zeros, so the codes are stored as
            floats.
    Raises:
        IndexError: if the encoded name (including the end of name token)
            does not fit in batch_size * num_steps + 1 positions
    """
    total = batch_size * num_steps
    data = np.zeros(total + 1)

    # build a real list: on Python 3 map() returns an iterator, which cannot
    # be concatenated with a list
    encoded = [_letter_to_number(letter) for letter in name]
    encoded.append(_EON)

    if len(encoded) > data.size:
        # same exception type the element-by-element fill used to raise
        raise IndexError(
            "name of length %d does not fit in a batch of %d x %d"
            % (len(name), batch_size, num_steps))
    data[:len(encoded)] = encoded

    x = data[:total].reshape((batch_size, num_steps))
    y = data[1:total + 1].reshape((batch_size, num_steps))

    return x, y