Commit 19d80b98 authored by Xin Pan's avatar Xin Pan Committed by GitHub
Browse files

Merge pull request #379 from panyx0718/master

Example of data conversion and fix comments
parents f98c5ded 85836119
...@@ -27,6 +27,9 @@ for example vocabulary format. In <b>How To Run</b> below, users can use toy ...@@ -27,6 +27,9 @@ for example vocabulary format. In <b>How To Run</b> below, users can use toy
data and vocab provided in the data/ directory to run the training by replacing data and vocab provided in the data/ directory to run the training by replacing
the data directory flag. the data directory flag.
data_convert_example.py contains an example of converting data between the binary and text formats.
<b>Experiment Result</b> <b>Experiment Result</b>
8000 examples from testset are sampled to generate summaries and rouge score is 8000 examples from testset are sampled to generate summaries and rouge score is
...@@ -73,10 +76,13 @@ Install TensorFlow and Bazel. ...@@ -73,10 +76,13 @@ Install TensorFlow and Bazel.
```shell ```shell
# cd to your workspace # cd to your workspace
# clone the code to your workspace and create empty WORKSPACE file. # 1. Clone the textsum code to your workspace 'textsum' directory.
# move the data to your workspace. If don't have full dataset yet, copy # 2. Create an empty 'WORKSPACE' file in your workspace.
# the toy data from the data/ directory from code directory and rename # 3. Move the train/eval/test data to your workspace 'data' directory.
# the files. # In the following example, I named the data training-*, test-*, etc.
# If your data files have different names, update the --data_path.
# If you don't have data but want to try out the model, copy the toy
# data from the textsum/data/data to the data/ directory in the workspace.
ls -R ls -R
.: .:
data textsum WORKSPACE data textsum WORKSPACE
......
...@@ -70,11 +70,15 @@ class Vocab(object): ...@@ -70,11 +70,15 @@ class Vocab(object):
return self._count return self._count
def ExampleGen(recordio_path, num_epochs=None): def ExampleGen(data_path, num_epochs=None):
"""Generates tf.Examples from path of recordio files. """Generates tf.Examples from path of data files.
Binary data format: <length><blob>. <length> represents the byte size
of <blob>. <blob> is serialized tf.Example proto. The tf.Example contains
the tokenized article text and summary.
Args: Args:
recordio_path: CNS path to tf.Example recordio data_path: path to tf.Example data files.
num_epochs: Number of times to go through the data. None means infinite. num_epochs: Number of times to go through the data. None means infinite.
Yields: Yields:
...@@ -86,7 +90,7 @@ def ExampleGen(recordio_path, num_epochs=None): ...@@ -86,7 +90,7 @@ def ExampleGen(recordio_path, num_epochs=None):
while True: while True:
if num_epochs is not None and epoch >= num_epochs: if num_epochs is not None and epoch >= num_epochs:
break break
filelist = glob.glob(recordio_path) filelist = glob.glob(data_path)
assert filelist, 'Empty filelist.' assert filelist, 'Empty filelist.'
random.shuffle(filelist) random.shuffle(filelist)
for f in filelist: for f in filelist:
......
"""Example of Converting TextSum model data.
Usage:
python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data
python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data
python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2
diff data/text_data2 data/text_data
"""
import struct
import sys
import tensorflow as tf
from tensorflow.core.example import example_pb2
FLAGS = tf.app.flags.FLAGS
# Conversion direction: 'binary_to_text' or 'text_to_binary'.
# NOTE: adjacent string literals are concatenated verbatim, so the first
# fragment must end with a space to keep the help text readable.
tf.app.flags.DEFINE_string('command', 'binary_to_text',
                           'Either binary_to_text or text_to_binary. '
                           'Specify FLAGS.in_file accordingly.')
# Source file to convert and destination file to write.
tf.app.flags.DEFINE_string('in_file', '', 'path to file')
tf.app.flags.DEFINE_string('out_file', '', 'path to file')
def _binary_to_text():
    """Converts the binary file FLAGS.in_file to text in FLAGS.out_file.

    The input is read as a sequence of records, each made of an 8-byte
    length (struct format 'q') followed by that many bytes holding a
    serialized tf.Example. Every record becomes one output line of
    tab-separated 'key=value' pairs, where value is the first bytes_list
    entry of the feature.

    Raises:
      struct.error: if a length prefix or a record body is truncated.
    """
    # Context managers guarantee both files are closed on the normal
    # end-of-file return as well as on errors.
    with open(FLAGS.in_file, 'rb') as reader, \
            open(FLAGS.out_file, 'w') as writer:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                sys.stderr.write('Done reading\n')
                return
            str_len = struct.unpack('q', len_bytes)[0]
            # Unpacking with an explicit size makes a short read fail
            # loudly instead of parsing a truncated proto.
            tf_example_str = struct.unpack(
                '%ds' % str_len, reader.read(str_len))[0]
            tf_example = example_pb2.Example.FromString(tf_example_str)
            examples = []
            for key in tf_example.features.feature:
                value = tf_example.features.feature[key].bytes_list.value[0]
                # Where bytes and str are distinct types, formatting a
                # bytes object with %s would emit its b'...' repr, so
                # decode it first. Where they are the same type this is
                # a no-op.
                if not isinstance(value, str):
                    value = value.decode('utf-8')
                examples.append('%s=%s' % (key, value))
            writer.write('%s\n' % '\t'.join(examples))
def _text_to_binary():
    """Converts the text file FLAGS.in_file to binary in FLAGS.out_file.

    Each non-blank input line is a set of tab-separated 'key=value' pairs.
    A line becomes one tf.Example whose feature `key` holds `value` as a
    single bytes_list entry. Each example is written as an 8-byte length
    (struct format 'q') followed by the serialized proto.

    Blank lines are skipped. Only the first '=' of a pair separates key
    from value, so values may themselves contain '='.

    Raises:
      ValueError: if a tab-separated field contains no '='.
    """
    # Read the whole input before opening the output, as the original
    # did, so the ordering of file operations is unchanged.
    with open(FLAGS.in_file, 'r') as reader:
        inputs = reader.readlines()
    with open(FLAGS.out_file, 'wb') as writer:
        for inp in inputs:
            line = inp.strip()
            if not line:
                continue
            tf_example = example_pb2.Example()
            for feature in line.split('\t'):
                k, v = feature.split('=', 1)
                # bytes_list entries must be bytes; encode text values
                # where str and bytes are distinct types.
                if not isinstance(v, bytes):
                    v = v.encode('utf-8')
                tf_example.features.feature[k].bytes_list.value.extend([v])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
def main(unused_argv):
    """Runs the conversion selected by FLAGS.command.

    Args:
      unused_argv: positional command-line arguments; ignored.

    Raises:
      ValueError: if --command, --in_file or --out_file is empty, or if
        --command is neither 'binary_to_text' nor 'text_to_binary'.
    """
    # Explicit checks instead of `assert`, which is removed under
    # `python -O`.
    if not (FLAGS.command and FLAGS.in_file and FLAGS.out_file):
        raise ValueError(
            '--command, --in_file and --out_file must all be specified.')
    if FLAGS.command == 'binary_to_text':
        _binary_to_text()
    elif FLAGS.command == 'text_to_binary':
        _text_to_binary()
    else:
        # Previously an unknown command fell through and did nothing.
        raise ValueError(
            'Unknown --command %r; expected binary_to_text or '
            'text_to_binary.' % FLAGS.command)


if __name__ == '__main__':
    tf.app.run()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment