Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
19d80b98
Commit
19d80b98
authored
Sep 30, 2016
by
Xin Pan
Committed by
GitHub
Sep 30, 2016
Browse files
Merge pull request #379 from panyx0718/master
Example of data conversion and fix comments
parents
f98c5ded
85836119
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
83 additions
and
8 deletions
+83
-8
textsum/README.md
textsum/README.md
+10
-4
textsum/data.py
textsum/data.py
+8
-4
textsum/data_convert_example.py
textsum/data_convert_example.py
+65
-0
No files found.
textsum/README.md
View file @
19d80b98
...
...
@@ -27,6 +27,9 @@ for example vocabulary format. In <b>How To Run</b> below, users can use toy
data and vocab provided in the data/ directory to run the training by replacing
the data directory flag.
data_convert_example.py contains an example of converting data between the binary and text formats.
<b>
Experiment Result
</b>
8000 examples from testset are sampled to generate summaries and rouge score is
...
...
@@ -73,10 +76,13 @@ Install TensorFlow and Bazel.
```
shell
# cd to your workspace
# clone the code to your workspace and create empty WORKSPACE file.
# move the data to your workspace. If don't have full dataset yet, copy
# the toy data from the data/ directory from code directory and rename
# the files.
# 1. Clone the textsum code to your workspace 'textsum' directory.
# 2. Create an empty 'WORKSPACE' file in your workspace.
# 3. Move the train/eval/test data to your workspace 'data' directory.
# In the following example, I named the data training-*, test-*, etc.
# If your data files have different names, update the --data_path.
# If you don't have data but want to try out the model, copy the toy
# data from the textsum/data/data to the data/ directory in the workspace.
ls
-R
.:
data textsum WORKSPACE
...
...
textsum/data.py
View file @
19d80b98
...
...
@@ -70,11 +70,15 @@ class Vocab(object):
return
self
.
_count
def
ExampleGen
(
recordio_path
,
num_epochs
=
None
):
"""Generates tf.Examples from path of recordio files.
def
ExampleGen
(
data_path
,
num_epochs
=
None
):
"""Generates tf.Examples from path of data files.
Binary data format: <length><blob>. <length> represents the byte size
of <blob>. <blob> is serialized tf.Example proto. The tf.Example contains
the tokenized article text and summary.
Args:
recordio
_path:
CNS
path to tf.Example
recordio
data
_path: path to tf.Example
data files.
num_epochs: Number of times to go through the data. None means infinite.
Yields:
...
...
@@ -86,7 +90,7 @@ def ExampleGen(recordio_path, num_epochs=None):
while
True
:
if
num_epochs
is
not
None
and
epoch
>=
num_epochs
:
break
filelist
=
glob
.
glob
(
recordio
_path
)
filelist
=
glob
.
glob
(
data
_path
)
assert
filelist
,
'Empty filelist.'
random
.
shuffle
(
filelist
)
for
f
in
filelist
:
...
...
textsum/data_convert_example.py
0 → 100644
View file @
19d80b98
"""Example of Converting TextSum model data.
Usage:
python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data
python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data
python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2
diff data/text_data2 data/text_data
"""
import
struct
import
sys
import
tensorflow
as
tf
from
tensorflow.core.example
import
example_pb2
# Command-line flags for the converter.
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
    'command', 'binary_to_text',
    # The trailing space keeps the two sentences apart once the adjacent
    # literals are concatenated into a single help string.
    'Either binary_to_text or text_to_binary. '
    'Specify FLAGS.in_file accordingly.')
tf.app.flags.DEFINE_string('in_file', '', 'path to file')
tf.app.flags.DEFINE_string('out_file', '', 'path to file')
def _binary_to_text():
    """Convert the binary file FLAGS.in_file into the text file FLAGS.out_file.

    The input is read as a sequence of records, each an 8-byte length
    (struct format 'q') followed by that many bytes holding a serialized
    tf.Example. One output line is written per record, made of tab-separated
    'key=value' pairs where value is the first bytes_list entry of the
    feature named key.
    """
    # The context managers close both files on every exit path, including
    # the end-of-input return below and any parsing error.
    with open(FLAGS.in_file, 'rb') as reader, \
            open(FLAGS.out_file, 'w') as writer:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                # An empty read means the input is exhausted.
                sys.stderr.write('Done reading\n')
                return
            str_len = struct.unpack('q', len_bytes)[0]
            tf_example_str = struct.unpack(
                '%ds' % str_len, reader.read(str_len))[0]
            tf_example = example_pb2.Example.FromString(tf_example_str)
            features = tf_example.features.feature
            examples = [
                '%s=%s' % (key, features[key].bytes_list.value[0])
                for key in features
            ]
            writer.write('%s\n' % '\t'.join(examples))
def _text_to_binary():
    """Convert the text file FLAGS.in_file into the binary file FLAGS.out_file.

    Each input line is a tab-separated list of 'key=value' pairs and becomes
    one tf.Example whose feature `key` holds `value` in its bytes_list. Every
    example is written as an 8-byte length (struct format 'q') followed by
    the serialized proto.
    """
    # Read the whole input before opening the output, so the output file is
    # only created or truncated after the input has been fully consumed.
    with open(FLAGS.in_file, 'r') as reader:
        inputs = reader.readlines()

    with open(FLAGS.out_file, 'wb') as writer:
        for inp in inputs:
            tf_example = example_pb2.Example()
            for feature in inp.strip().split('\t'):
                # Split on the first '=' only: the value may itself
                # contain '=' characters.
                k, v = feature.split('=', 1)
                tf_example.features.feature[k].bytes_list.value.extend([v])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
def main(unused_argv):
    """Run the conversion selected by FLAGS.command.

    Args:
      unused_argv: positional command-line arguments; ignored.

    Raises:
      ValueError: if any of FLAGS.command, FLAGS.in_file or FLAGS.out_file
        is empty, or if FLAGS.command is not a supported conversion.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not (FLAGS.command and FLAGS.in_file and FLAGS.out_file):
        raise ValueError(
            '--command, --in_file and --out_file must all be non-empty.')

    if FLAGS.command == 'binary_to_text':
        _binary_to_text()
    elif FLAGS.command == 'text_to_binary':
        _text_to_binary()
    else:
        # Fail loudly rather than exiting successfully without converting.
        raise ValueError(
            'Unknown --command %r; expected binary_to_text or text_to_binary.'
            % FLAGS.command)
# Start the app runner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    tf.app.run()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment