Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
4302ace5
Unverified
Commit
4302ace5
authored
Jul 27, 2020
by
Sam Shleifer
Committed by
GitHub
Jul 27, 2020
Browse files
[pack_dataset] don't sort before packing, only pack train (#5954)
parent
c8bdf7f4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
12 deletions
+27
-12
examples/requirements.txt
examples/requirements.txt
+1
-0
examples/seq2seq/minify_dataset.py
examples/seq2seq/minify_dataset.py
+19
-0
examples/seq2seq/pack_dataset.py
examples/seq2seq/pack_dataset.py
+7
-12
No files found.
examples/requirements.txt
View file @
4302ace5
...
@@ -13,3 +13,4 @@ streamlit
...
@@ -13,3 +13,4 @@ streamlit
elasticsearch
elasticsearch
pandas
pandas
nlp
nlp
fire
examples/seq2seq/minify_dataset.py
0 → 100644
View file @
4302ace5
from
pathlib
import
Path
import
fire
def minify(src_dir: str, dest_dir: str, n: int):
    """Write first n lines of each file f in src_dir to dest_dir/f.

    Args:
        src_dir: directory whose regular files are truncated. Sub-directories
            are skipped instead of crashing the run.
        dest_dir: output directory; created if missing (its parent must exist).
        n: number of leading lines to keep from each file.

    Trailing whitespace is stripped from every kept line and the lines are
    joined with a single newline, so output files have no trailing newline.
    """
    from itertools import islice

    src_dir = Path(src_dir)
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(exist_ok=True)
    for path in src_dir.iterdir():
        if not path.is_file():
            # Only flat files can be truncated; opening a directory would raise.
            continue
        with path.open() as src_file:
            if n < 0:
                # Keep the historical slice semantics (drop the last |n| lines).
                new = [x.rstrip() for x in src_file][:n]
            else:
                # Stop reading after n lines instead of loading the whole file.
                new = [x.rstrip() for x in islice(src_file, n)]
        dest_path = dest_dir.joinpath(path.name)
        print(dest_path)
        # Context manager guarantees the output is flushed and closed.
        with dest_path.open("w") as dest_file:
            dest_file.write("\n".join(new))
# Script entry point: hand `minify` to python-fire so it can be invoked from the command line.
if __name__ == "__main__":
    fire.Fire(minify)
examples/seq2seq/pack_dataset.py
View file @
4302ace5
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
"""
"""
import
argparse
import
argparse
import
shutil
from
pathlib
import
Path
from
pathlib
import
Path
from
tqdm
import
tqdm
from
tqdm
import
tqdm
...
@@ -17,7 +18,7 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
...
@@ -17,7 +18,7 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
finished_src
,
finished_tgt
=
[],
[]
finished_src
,
finished_tgt
=
[],
[]
sorted_examples
=
list
(
sorted
(
zip
(
src_examples
,
tgt_examples
)
,
key
=
lambda
x
:
len
(
x
[
0
]))
)
sorted_examples
=
list
(
zip
(
src_examples
,
tgt_examples
))
new_src
,
new_tgt
=
sorted_examples
[
0
]
new_src
,
new_tgt
=
sorted_examples
[
0
]
def
is_too_big
(
strang
):
def
is_too_big
(
strang
):
...
@@ -42,20 +43,10 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
...
@@ -42,20 +43,10 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
return
finished_src
,
finished_tgt
return
finished_src
,
finished_tgt
def minify(src_dir: Path, dest_dir: Path, n: int):
    """Write first n lines of each file f in src_dir to dest_dir/f

    Args:
        src_dir: directory whose regular files are truncated. Sub-directories
            are skipped instead of crashing the run.
        dest_dir: output directory; created if missing (its parent must exist).
        n: number of leading lines to keep from each file.

    Trailing whitespace is stripped from every kept line and the lines are
    joined with a single newline, so output files have no trailing newline.
    """
    from itertools import islice

    dest_dir.mkdir(exist_ok=True)
    for path in src_dir.iterdir():
        if not path.is_file():
            # Only flat files can be truncated; opening a directory would raise.
            continue
        with path.open() as src_file:
            if n < 0:
                # Keep the historical slice semantics (drop the last |n| lines).
                new = [x.rstrip() for x in src_file][:n]
            else:
                # Stop reading after n lines instead of loading the whole file.
                new = [x.rstrip() for x in islice(src_file, n)]
        dest_path = dest_dir.joinpath(path.name)
        print(dest_path)
        # Context manager guarantees the output is flushed and closed.
        with dest_path.open("w") as dest_file:
            dest_file.write("\n".join(new))
def
pack_data_dir
(
tok
,
data_dir
:
Path
,
max_tokens
,
save_path
):
def
pack_data_dir
(
tok
,
data_dir
:
Path
,
max_tokens
,
save_path
):
save_path
=
Path
(
save_path
)
save_path
=
Path
(
save_path
)
save_path
.
mkdir
(
exist_ok
=
True
)
save_path
.
mkdir
(
exist_ok
=
True
)
for
split
in
[
"val"
,
"test"
,
"train"
]:
for
split
in
[
"train"
]:
src_path
,
tgt_path
=
data_dir
/
f
"
{
split
}
.source"
,
data_dir
/
f
"
{
split
}
.target"
src_path
,
tgt_path
=
data_dir
/
f
"
{
split
}
.source"
,
data_dir
/
f
"
{
split
}
.target"
src_docs
=
[
x
.
rstrip
()
for
x
in
Path
(
src_path
).
open
().
readlines
()]
src_docs
=
[
x
.
rstrip
()
for
x
in
Path
(
src_path
).
open
().
readlines
()]
tgt_docs
=
[
x
.
rstrip
()
for
x
in
Path
(
tgt_path
).
open
().
readlines
()]
tgt_docs
=
[
x
.
rstrip
()
for
x
in
Path
(
tgt_path
).
open
().
readlines
()]
...
@@ -63,6 +54,10 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
...
@@ -63,6 +54,10 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
print
(
f
"packed
{
split
}
split from
{
len
(
src_docs
)
}
examples ->
{
len
(
packed_src
)
}
."
)
print
(
f
"packed
{
split
}
split from
{
len
(
src_docs
)
}
examples ->
{
len
(
packed_src
)
}
."
)
Path
(
save_path
/
f
"
{
split
}
.source"
).
open
(
"w"
).
write
(
"
\n
"
.
join
(
packed_src
))
Path
(
save_path
/
f
"
{
split
}
.source"
).
open
(
"w"
).
write
(
"
\n
"
.
join
(
packed_src
))
Path
(
save_path
/
f
"
{
split
}
.target"
).
open
(
"w"
).
write
(
"
\n
"
.
join
(
packed_tgt
))
Path
(
save_path
/
f
"
{
split
}
.target"
).
open
(
"w"
).
write
(
"
\n
"
.
join
(
packed_tgt
))
for
split
in
[
"val"
,
"test"
]:
src_path
,
tgt_path
=
data_dir
/
f
"
{
split
}
.source"
,
data_dir
/
f
"
{
split
}
.target"
shutil
.
copyfile
(
src_path
,
save_path
/
f
"
{
split
}
.source"
)
shutil
.
copyfile
(
tgt_path
,
save_path
/
f
"
{
split
}
.target"
)
def
packer_cli
():
def
packer_cli
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment