OpenDAS / Fairseq · Commit 862cad11
"src/vscode:/vscode.git/clone" did not exist on "dcf320f2937f19581bad195e35d6ba796d807c42"
Parallel preprocessing

Commit 862cad11, authored Sep 12, 2018 by Sergey Edunov and committed Sep 25, 2018 by Myle Ott. Parent: ee46c63b.

Showing 3 changed files with 169 additions and 34 deletions (+169 −34):
fairseq/data/indexed_dataset.py   +29 −3
fairseq/tokenizer.py              +71 −13
preprocess.py                     +69 −18
fairseq/data/indexed_dataset.py

@@ -52,9 +52,15 @@ def data_file_path(prefix_path):
 class IndexedDataset(torch.utils.data.Dataset):
     """Loader for TorchNet IndexedDataset"""
 
-    def __init__(self, path, fix_lua_indexing=False):
+    def __init__(self, path, fix_lua_indexing=False, read_data=True):
         super().__init__()
         self.fix_lua_indexing = fix_lua_indexing
+        self.read_index(path)
+        self.data_file = None
+        if read_data:
+            self.read_data(path)
+
+    def read_index(self, path):
         with open(index_file_path(path), 'rb') as f:
             magic = f.read(8)
             assert magic == b'TNTIDX\x00\x00'
@@ -66,7 +72,6 @@ class IndexedDataset(torch.utils.data.Dataset):
             self.dim_offsets = read_longs(f, self.size + 1)
             self.data_offsets = read_longs(f, self.size + 1)
             self.sizes = read_longs(f, self.s)
-        self.read_data(path)
 
     def read_data(self, path):
         self.data_file = open(data_file_path(path), 'rb', buffering=0)
@@ -76,7 +81,8 @@ class IndexedDataset(torch.utils.data.Dataset):
             raise IndexError('index out of range')
 
     def __del__(self):
-        self.data_file.close()
+        if self.data_file:
+            self.data_file.close()
 
     def __getitem__(self, i):
         self.check_index(i)
@@ -193,6 +199,26 @@ class IndexedDatasetBuilder(object):
         self.sizes.append(s)
         self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
 
+    def merge_file_(self, another_file):
+        index = IndexedDataset(another_file, read_data=False)
+        assert index.dtype == self.dtype
+
+        begin = self.data_offsets[-1]
+        for offset in index.data_offsets[1:]:
+            self.data_offsets.append(begin + offset)
+        self.sizes.extend(index.sizes)
+        begin = self.dim_offsets[-1]
+        for dim_offset in index.dim_offsets[1:]:
+            self.dim_offsets.append(begin + dim_offset)
+
+        with open(data_file_path(another_file), 'rb') as f:
+            while True:
+                data = f.read(1024)
+                if data:
+                    self.out_file.write(data)
+                else:
+                    break
+
     def finalize(self, index_file):
         self.out_file.close()
         index = open(index_file, 'wb')
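merge_file_ appends a shard written by another IndexedDatasetBuilder: it loads only the shard's index (read_data=False), rebases the shard's cumulative data/dimension offsets onto the end of what has already been written, and streams the shard's raw .bin bytes into out_file. A minimal standalone sketch of the offset rebasing, not fairseq code:

# Each shard stores cumulative offsets starting at 0; appending a shard means
# shifting its offsets (minus the leading 0) by the last offset already recorded.
def merge_offsets(base, shard):
    begin = base[-1]
    return base + [begin + off for off in shard[1:]]

# Shard A holds items of sizes 3 and 5; shard B holds items of sizes 2 and 4.
a = [0, 3, 8]
b = [0, 2, 6]
assert merge_offsets(a, b) == [0, 3, 8, 10, 14]  # items at [0:3], [3:8], [8:10], [10:14]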
fairseq/tokenizer.py

@@ -6,10 +6,10 @@ # can be found in the PATENTS file in the same directory.
 
 from collections import Counter
-import re
+import os, re
 
 import torch
+from multiprocessing import Pool
 
-
 SPACE_NORMALIZER = re.compile("\s+")
@@ -20,28 +20,74 @@ def tokenize_line(line):
     return line.split()
 
 
+def safe_readline(f):
+    pos = f.tell()
+    while True:
+        try:
+            return f.readline()
+        except UnicodeDecodeError:
+            pos -= 1
+            f.seek(pos)  # search where this character begins
+
+
 class Tokenizer:
 
     @staticmethod
-    def add_file_to_dictionary(filename, dict, tokenize):
+    def add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
         counter = Counter()
         with open(filename, 'r') as f:
-            for line in f:
+            size = os.fstat(f.fileno()).st_size
+            chunk_size = size // num_workers
+            offset = worker_id * chunk_size
+            end = offset + chunk_size
+            f.seek(offset)
+            if offset > 0:
+                safe_readline(f)  # drop first incomplete line
+            line = f.readline()
+            while line:
                 for word in tokenize(line):
-                    dict.add_symbol(word)
-                dict.add_symbol(dict.eos_word)
+                    counter.update([word])
+                counter.update([eos_word])
+                if f.tell() > end:
+                    break
+                line = f.readline()
+        return counter
+
+    @staticmethod
+    def add_file_to_dictionary(filename, dict, tokenize, num_workers):
+        def merge_result(counter):
+            for w, c in counter.items():
+                dict.add_symbol(w, c)
+
+        if num_workers > 1:
+            pool = Pool(processes=num_workers)
+            results = []
+            for worker_id in range(num_workers):
+                results.append(pool.apply_async(
+                    Tokenizer.add_file_to_dictionary_single_worker,
+                    (filename, tokenize, dict.eos_word, worker_id, num_workers)
+                ))
+            pool.close()
+            pool.join()
+            for r in results:
+                merge_result(r.get())
+        else:
+            merge_result(Tokenizer.add_file_to_dictionary_single_worker(filename, tokenize, dict.eos_word))
 
     @staticmethod
     def binarize(filename, dict, consumer, tokenize=tokenize_line,
-                 append_eos=True, reverse_order=False):
+                 append_eos=True, reverse_order=False,
+                 offset=0, end=-1):
         nseq, ntok = 0, 0
         replaced = Counter()
 
         def replaced_consumer(word, idx):
             if idx == dict.unk_index and word != dict.unk_word:
                 replaced.update([word])
 
         with open(filename, 'r') as f:
-            for line in f:
+            f.seek(offset)
+            # next(f) breaks f.tell(), hence readline() must be used
+            line = safe_readline(f)
+            while line:
+                if end > 0 and f.tell() > end:
+                    break
                 ids = Tokenizer.tokenize(
                     line=line,
                     dict=dict,
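The dictionary build is now chunked: each worker seeks to its byte offset, drops the first partial line, counts the words in its range with a Counter, and the parent folds every worker's counts into the dictionary via dict.add_symbol(w, c). A standalone sketch of that pattern (not fairseq code; the file name is a placeholder, and fairseq's safe_readline guard for seeks that land inside a multi-byte UTF-8 character is reduced to a comment):

import os
from collections import Counter
from multiprocessing import Pool


def count_chunk(path, worker_id, num_workers):
    # Count words in the byte range [worker_id * chunk_size, (worker_id + 1) * chunk_size).
    counter = Counter()
    with open(path, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            f.readline()  # drop the first partial line; it belongs to the previous worker
        line = f.readline()
        while line:
            counter.update(line.split())
            if f.tell() > end:
                break
            line = f.readline()
    return counter


if __name__ == '__main__':
    path = 'corpus.txt'  # placeholder input file
    num_workers = 4
    with Pool(num_workers) as pool:
        parts = pool.starmap(count_chunk, [(path, w, num_workers) for w in range(num_workers)])
    print(sum(parts, Counter()).most_common(10))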
@@ -52,10 +98,22 @@ class Tokenizer:
                     reverse_order=reverse_order,
                 )
                 nseq += 1
-                consumer(ids)
                 ntok += len(ids)
-        return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': len(replaced)}
+                consumer(ids)
+                line = f.readline()
+        return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
+
+    @staticmethod
+    def find_offsets(filename, num_chunks):
+        with open(filename, 'r') as f:
+            size = os.fstat(f.fileno()).st_size
+            chunk_size = size // num_chunks
+            offsets = [0 for _ in range(num_chunks + 1)]
+            for i in range(1, num_chunks):
+                f.seek(chunk_size * i)
+                safe_readline(f)
+                offsets[i] = f.tell()
+            return offsets
 
     @staticmethod
     def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True,
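find_offsets seeks to evenly spaced byte positions and reads ahead to the next newline, so every chunk boundary lands on a line boundary; worker i then binarizes the range [offsets[i], offsets[i + 1]). The last entry is left at 0, and binarize treats end <= 0 as "read to EOF". A small sketch (not fairseq code; it uses binary mode to sidestep the UTF-8 seek handling and sets the final offset to the file size) that checks the chunks tile the file exactly:

import os


def find_offsets(path, num_chunks):
    with open(path, 'rb') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0] * (num_chunks + 1)
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # advance to the next line boundary
            offsets[i] = f.tell()
        offsets[num_chunks] = size  # the patch leaves this at 0 and relies on end <= 0 meaning EOF
        return offsets


if __name__ == '__main__':
    path = 'corpus.txt'  # placeholder input file
    offsets = find_offsets(path, 4)
    with open(path, 'rb') as f:
        whole = f.read()
    with open(path, 'rb') as f:
        parts = []
        for start, end in zip(offsets[:-1], offsets[1:]):
            f.seek(start)
            parts.append(f.read(end - start))
    assert b''.join(parts) == whole  # no line is split across chunks or counted twice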
preprocess.py

@@ -10,12 +10,16 @@ Data pre-processing: build vocabularies and binarize training data.
 """
 
 import argparse
+from collections import Counter
 from itertools import zip_longest
 import os
 import shutil
 
 from fairseq.data import indexed_dataset, dictionary
 from fairseq.tokenizer import Tokenizer, tokenize_line
+from multiprocessing import Pool, Manager, Process
 
 
 def get_parser():
@@ -41,6 +45,7 @@ def get_parser():
     parser.add_argument('--only-source', action='store_true',
                         help='Only process the source language')
     parser.add_argument('--padding-factor', metavar='N', default=8, type=int,
                         help='Pad dictionary size to be multiple of N')
+    parser.add_argument('--workers', metavar='N', default=1, type=int, help='number of parallel workers')
     return parser
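The new --workers flag (default 1, which preserves the previous single-process behaviour) drives both the parallel dictionary build and the parallel binarization. A hypothetical invocation, with placeholder paths and language pair:

python preprocess.py --source-lang de --target-lang en \
    --trainpref data/train --validpref data/valid \
    --destdir data-bin --workers 8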
@@ -52,7 +57,7 @@ def main(args):
     def build_dictionary(filenames):
         d = dictionary.Dictionary()
         for filename in filenames:
-            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
+            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.workers)
         return d
 
     def train_path(lang):
@@ -70,11 +75,6 @@ def main(args):
     def dict_path(lang):
         return dest_path('dict', lang) + '.txt'
 
-    def dataset_dest_path(output_prefix, lang, extension):
-        base = f'{args.destdir}/{output_prefix}'
-        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
-        return f'{base}{lang_part}.{extension}'
-
     if args.joined_dictionary:
         assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
         assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
@@ -111,25 +111,54 @@ def main(args):
         )
         tgt_dict.save(dict_path(args.target_lang))
 
-    def make_binary_dataset(input_prefix, output_prefix, lang):
+    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
         dict = dictionary.Dictionary.load(dict_path(lang))
         print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
+        n_seq_tok = [0, 0]
+        replaced = Counter()
 
-        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))
-
-        def consumer(tensor):
-            ds.add_item(tensor)
+        def merge_result(worker_result):
+            replaced.update(worker_result['replaced'])
+            n_seq_tok[0] += worker_result['nseq']
+            n_seq_tok[1] += worker_result['ntok']
 
         input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
-        res = Tokenizer.binarize(input_file, dict, consumer)
+        offsets = Tokenizer.find_offsets(input_file, num_workers)
+        pool = None
+        if num_workers > 1:
+            pool = Pool(processes=num_workers - 1)
+            for worker_id in range(1, num_workers):
+                prefix = "{}{}".format(output_prefix, worker_id)
+                pool.apply_async(binarize, (args, input_file, dict, prefix, lang,
+                                            offsets[worker_id],
+                                            offsets[worker_id + 1]), callback=merge_result)
+            pool.close()
+
+        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))
+        merge_result(Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
+                                        offset=0, end=offsets[1]))
+        if num_workers > 1:
+            pool.join()
+            for worker_id in range(1, num_workers):
+                prefix = "{}{}".format(output_prefix, worker_id)
+                temp_file_path = dataset_dest_prefix(args, prefix, lang)
+                ds.merge_file_(temp_file_path)
+                os.remove(indexed_dataset.data_file_path(temp_file_path))
+                os.remove(indexed_dataset.index_file_path(temp_file_path))
+
+        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
+
         print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
-            lang, input_file, res['nseq'], res['ntok'],
-            100 * res['nunk'] / res['ntok'], dict.unk_word))
-        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
+            lang, input_file, n_seq_tok[0], n_seq_tok[1],
+            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
 
-    def make_dataset(input_prefix, output_prefix, lang):
+    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
         if args.output_format == 'binary':
-            make_binary_dataset(input_prefix, output_prefix, lang)
+            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
         elif args.output_format == 'raw':
             # Copy original text file to destination folder
             output_text_file = dest_path(
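The flow in make_binary_dataset: compute line-aligned offsets, hand chunks 1..N-1 to a Pool whose workers each write a temporary shard and report their stats through the apply_async callback, binarize chunk 0 in the main process straight into the final dataset, then after pool.join() append each temporary shard with merge_file_ and delete it. A standalone sketch of that fan-out/merge-back pattern, not fairseq code (the toy process_chunk just counts tokens):

from multiprocessing import Pool


def process_chunk(chunk_id, lines):
    # Stand-in for the per-chunk binarize() worker: return a summary dict.
    return {'chunk': chunk_id, 'ntok': sum(len(line.split()) for line in lines)}


def run(chunks):
    totals = {'ntok': 0}

    def merge_result(result):          # plays the role of the apply_async callback
        totals['ntok'] += result['ntok']

    pool = None
    if len(chunks) > 1:
        pool = Pool(processes=len(chunks) - 1)
        for cid in range(1, len(chunks)):
            pool.apply_async(process_chunk, (cid, chunks[cid]), callback=merge_result)
        pool.close()

    merge_result(process_chunk(0, chunks[0]))   # chunk 0 is handled by the parent itself

    if pool is not None:
        pool.join()                    # all callbacks have run once join() returns
    return totals


if __name__ == '__main__':
    print(run([['a b', 'c'], ['d e f'], ['g']]))   # {'ntok': 7}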
@@ -140,7 +169,7 @@ def main(args):
     def make_all(lang):
         if args.trainpref:
-            make_dataset(args.trainpref, 'train', lang)
+            make_dataset(args.trainpref, 'train', lang, num_workers=args.workers)
         if args.validpref:
             for k, validpref in enumerate(args.validpref.split(',')):
                 outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
@@ -196,6 +225,28 @@ def main(args):
             print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
 
 
+def binarize(args, filename, dict, output_prefix, lang, offset, end):
+    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))
+
+    def consumer(tensor):
+        ds.add_item(tensor)
+
+    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
+    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
+    return res
+
+
+def dataset_dest_prefix(args, output_prefix, lang):
+    base = f'{args.destdir}/{output_prefix}'
+    lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
+    return f'{base}{lang_part}'
+
+
+def dataset_dest_file(args, output_prefix, lang, extension):
+    base = dataset_dest_prefix(args, output_prefix, lang)
+    return f'{base}.{extension}'
+
+
 if __name__ == '__main__':
     parser = get_parser()
     args = parser.parse_args()
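To make the temporary-shard naming concrete, a small sketch that reuses the f-strings above with hypothetical settings (--destdir data-bin, --source-lang de, --target-lang en, --workers 3) for the source side of the training set:

class Args:
    # Stand-in for the parsed argparse namespace; the values are placeholders.
    destdir, source_lang, target_lang = 'data-bin', 'de', 'en'


def dataset_dest_prefix(args, output_prefix, lang):
    base = f'{args.destdir}/{output_prefix}'
    lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
    return f'{base}{lang_part}'


args = Args()
# Worker 0 (the main process) writes the final dataset directly:
#   data-bin/train.de-en.de.bin / .idx
print(dataset_dest_prefix(args, 'train', 'de'))
# Workers 1 and 2 write temporary shards that merge_file_ appends and os.remove deletes:
#   data-bin/train1.de-en.de.bin / .idx and data-bin/train2.de-en.de.bin / .idx
for worker_id in (1, 2):
    print(dataset_dest_prefix(args, 'train{}'.format(worker_id), 'de'))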