Merge pull request #379 from panyx0718/master

Example of data conversion and fix comments

Merge pull request #379 from panyx0718/master
Example of data conversion and fix comments
19d80b98 · Xin Pan · GitHub · f98c5ded · 85836119 · 19d80b98
Commit 19d80b98 authored Sep 30, 2016 by Xin Pan Committed by GitHub Sep 30, 2016
Show whitespace changes
Inline Side-by-side

Showing with 83 additions and 8 deletions

textsum/README.md textsum/README.md +10 -4

textsum/data.py textsum/data.py +8 -4

textsum/data_convert_example.py textsum/data_convert_example.py +65 -0

No files found.
--- a/textsum/README.md
+++ b/textsum/README.md
@@ -27,6 +27,9 @@ for example vocabulary format. In <b>How To Run</b> below, users can use toy
 data and vocab provided in the data/ directory to run the training by replacing
 the data directory flag.

+data_convert_example.py contains an example of converting data between the binary and text formats.
+
+
 <b>Experiment Result</b>

 8000 examples from testset are sampled to generate summaries and rouge score is
@@ -73,10 +76,13 @@ Install TensorFlow and Bazel.

 ```shell
 # cd to your workspace
-# clone the code to your workspace and create empty WORKSPACE file.
-# move the data to your workspace. If don't have full dataset yet, copy
-# the toy data from the data/ directory from code directory and rename
-# the files.
+# 1. Clone the textsum code to your workspace 'textsum' directory.
+# 2. Create an empty 'WORKSPACE' file in your workspace.
+# 3. Move the train/eval/test data to your workspace 'data' directory.
+#    In the following example, I named the data training-*, test-*, etc.
+#    If your data files have different names, update the --data_path.
+#    If you don't have data but want to try out the model, copy the toy
+#    data from textsum/data/data to the data/ directory in the workspace.
 ls -R
 .:
 data  textsum  WORKSPACE

--- a/textsum/data.py
+++ b/textsum/data.py
@@ -70,11 +70,15 @@ class Vocab(object):
    return self._count


-def ExampleGen(recordio_path, num_epochs=None):
-  """Generates tf.Examples from path of recordio files.
+def ExampleGen(data_path, num_epochs=None):
+  """Generates tf.Examples from path of data files.
+
+    Binary data format: <length><blob>. <length> represents the byte size
+    of <blob>. <blob> is serialized tf.Example proto. The tf.Example contains
+    the tokenized article text and summary.

  Args:
-    recordio_path: CNS path to tf.Example recordio
+    data_path: path to tf.Example data files.
    num_epochs: Number of times to go through the data. None means infinite.

  Yields:
@@ -86,7 +90,7 @@ def ExampleGen(recordio_path, num_epochs=None):
  while True:
    if num_epochs is not None and epoch >= num_epochs:
      break
-    filelist = glob.glob(recordio_path)
+    filelist = glob.glob(data_path)
    assert filelist, 'Empty filelist.'
    random.shuffle(filelist)
    for f in filelist:

--- a/textsum/data_convert_example.py
+++ b/textsum/data_convert_example.py
+"""Example of Converting TextSum model data.
+Usage:
+python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data
+python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data
+python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2
+diff data/text_data2 data/text_data
+"""
+
+import struct
+import sys
+
+import tensorflow as tf
+from tensorflow.core.example import example_pb2
+
+# Command-line flags: which conversion to run and the input/output paths.
+FLAGS = tf.app.flags.FLAGS
+tf.app.flags.DEFINE_string('command', 'binary_to_text',
+                           'Either binary_to_text or text_to_binary. '
+                           'Specify FLAGS.in_file accordingly.')
+tf.app.flags.DEFINE_string('in_file', '', 'path to file')
+tf.app.flags.DEFINE_string('out_file', '', 'path to file')
+
+def _binary_to_text():
+  """Converts the binary file FLAGS.in_file into the text file FLAGS.out_file.
+
+  The input is read as a sequence of records, each an 8-byte length (struct
+  format 'q') followed by that many bytes holding a serialized tf.Example.
+  One line is written per record: tab-separated 'key=value' pairs, where
+  value is the first bytes_list entry of the feature named key.
+
+  Raises:
+    struct.error: if a length header or a blob is shorter than expected.
+  """
+  # The with-statement closes both files on every exit path, including the
+  # return taken at end of input and any exception raised while parsing.
+  with open(FLAGS.in_file, 'rb') as reader, \
+       open(FLAGS.out_file, 'w') as writer:
+    while True:
+      len_bytes = reader.read(8)
+      if not len_bytes:
+        sys.stderr.write('Done reading\n')
+        return
+      str_len = struct.unpack('q', len_bytes)[0]
+      # Unpacking with an explicit size fails loudly on a truncated blob.
+      tf_example_str = struct.unpack(
+          '%ds' % str_len, reader.read(str_len))[0]
+      tf_example = example_pb2.Example.FromString(tf_example_str)
+      examples = []
+      for key in tf_example.features.feature:
+        value = tf_example.features.feature[key].bytes_list.value[0]
+        examples.append('%s=%s' % (key, value))
+      writer.write('%s\n' % '\t'.join(examples))
+
+
+def _text_to_binary():
+  """Converts the text file FLAGS.in_file into the binary file FLAGS.out_file.
+
+  Each input line is stripped and split on tabs into 'key=value' pairs. Every
+  pair becomes a bytes_list feature of one tf.Example, which is serialized
+  and written as an 8-byte length (struct format 'q') followed by the blob.
+
+  Raises:
+    ValueError: if a tab-separated field contains no '=' separator.
+  """
+  # Read the whole input before opening the output, and close the input
+  # handle as soon as its lines are in memory.
+  with open(FLAGS.in_file, 'r') as reader:
+    inputs = reader.readlines()
+  with open(FLAGS.out_file, 'wb') as writer:
+    for inp in inputs:
+      tf_example = example_pb2.Example()
+      for feature in inp.strip().split('\t'):
+        # Split on the first '=' only, so values that contain '=' are kept
+        # intact instead of breaking the two-way unpacking.
+        (k, v) = feature.split('=', 1)
+        tf_example.features.feature[k].bytes_list.value.extend([v])
+      tf_example_str = tf_example.SerializeToString()
+      str_len = len(tf_example_str)
+      writer.write(struct.pack('q', str_len))
+      writer.write(struct.pack('%ds' % str_len, tf_example_str))
+
+
+def main(unused_argv):
+  """Runs the conversion selected by FLAGS.command.
+
+  Args:
+    unused_argv: positional command-line arguments; ignored.
+
+  Raises:
+    ValueError: if a required flag is empty or FLAGS.command is not one of
+      'binary_to_text' or 'text_to_binary'.
+  """
+  # Raise instead of assert so the check still runs under `python -O`.
+  if not (FLAGS.command and FLAGS.in_file and FLAGS.out_file):
+    raise ValueError('--command, --in_file and --out_file must all be set.')
+  if FLAGS.command == 'binary_to_text':
+    _binary_to_text()
+  elif FLAGS.command == 'text_to_binary':
+    _text_to_binary()
+  else:
+    # An unrecognized command used to fall through and silently do nothing.
+    raise ValueError('Unknown --command: %s' % FLAGS.command)
+
+
+if __name__ == '__main__':
+  tf.app.run()