Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Fairseq
Commits
71d2d44c
Commit
71d2d44c
authored
Jan 12, 2018
by
Sergey Edunov
Browse files
Prepare scripts for WMT14
parent
9430544a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
266 additions
and
4 deletions
+266
-4
data/prepare-wmt14en2de.sh
data/prepare-wmt14en2de.sh
+127
-0
data/prepare-wmt14en2fr.sh
data/prepare-wmt14en2fr.sh
+136
-0
fairseq/tokenizer.py
fairseq/tokenizer.py
+3
-4
No files found.
data/prepare-wmt14en2de.sh
0 → 100644
View file @
71d2d44c
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# Downloads the WMT'14 English-German parallel corpora and test set,
# tokenizes them with the Moses scripts, learns a joint BPE vocabulary
# (subword-nmt) and writes BPE-encoded train/valid/test splits into
# wmt14_en_de/.

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt
BPE_TOKENS=40000

URLS=(
    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    "http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
    # BUGFIX: the test-set SGM files are read below but were never downloaded.
    "http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
    "training-parallel-europarl-v7.tgz"
    "training-parallel-commoncrawl.tgz"
    "training-parallel-nc-v9.tgz"
    "test-full.tgz"
)
CORPORA=(
    "training/europarl-v7.de-en"
    "commoncrawl.de-en"
    "training/news-commentary-v9.de-en"
)

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    # BUGFIX: a bare 'exit' returns the status of the preceding echo (0),
    # silently reporting success; exit non-zero so callers see the failure.
    exit 1
fi

src=en
tgt=de
lang=en-de
prep=wmt14_en_de
tmp=$prep/tmp
orig=orig

mkdir -p "$orig" "$tmp" "$prep"

cd "$orig"

for ((i=0;i<${#URLS[@]};++i)); do
    file=${FILES[i]}
    if [ -f "$file" ]; then
        echo "$file already exists, skipping download"
    else
        url=${URLS[i]}
        wget "$url"
        if [ -f "$file" ]; then
            echo "$url successfully downloaded."
        else
            echo "$url not successfully downloaded."
            # BUGFIX: 'exit -1' is not a valid exit status; use 1.
            exit 1
        fi
        # Unpack by extension ('${file: -4}' = last four characters).
        if [ "${file: -4}" == ".tgz" ]; then
            tar zxvf "$file"
        elif [ "${file: -4}" == ".tar" ]; then
            tar xvf "$file"
        fi
    fi
done
cd ..

echo "pre-processing train data..."
for l in $src $tgt; do
    # -f: the tokenized file does not exist on the first run; a plain 'rm'
    # printed a spurious error.
    rm -f "$tmp/train.tags.$lang.tok.$l"
    for f in "${CORPORA[@]}"; do
        cat "$orig/$f.$l" | \
            perl "$NORM_PUNC" "$l" | \
            perl "$REM_NON_PRINT_CHAR" | \
            perl "$TOKENIZER" -threads 8 -a -l "$l" >> "$tmp/train.tags.$lang.tok.$l"
    done
done

echo "pre-processing test data..."
for l in $src $tgt; do
    if [ "$l" == "$src" ]; then
        t="src"
    else
        t="ref"
    fi
    # Extract segment text from the SGML test files and tokenize it.
    grep '<seg id' "$orig/test-full/newstest2014-deen-$t.$l.sgm" | \
        sed -e 's/<seg id="[0-9]*">\s*//g' | \
        sed -e 's/\s*<\/seg>\s*//g' | \
        sed -e "s/\’/\'/g" | \
        perl "$TOKENIZER" -threads 8 -a -l "$l" > "$tmp/test.$l"
    echo ""
done

echo "splitting train and valid..."
for l in $src $tgt; do
    # Hold out every 100th sentence pair for validation.
    awk '{if (NR%100 == 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/valid.$l"
    awk '{if (NR%100 != 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/train.$l"
done

TRAIN=$tmp/train.de-en
BPE_CODE=$prep/code
rm -f "$TRAIN"
# Learn a joint BPE over the concatenation of both language sides.
for l in $src $tgt; do
    cat "$tmp/train.$l" >> "$TRAIN"
done

echo "learn_bpe.py on ${TRAIN}..."
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$tmp/bpe.$f"
    done
done

# Drop pairs with length ratio > 1.5 or length outside [1, 250] tokens.
perl "$CLEAN" -ratio 1.5 "$tmp/bpe.train" "$src" "$tgt" "$prep/train" 1 250
perl "$CLEAN" -ratio 1.5 "$tmp/bpe.valid" "$src" "$tgt" "$prep/valid" 1 250

for L in $src $tgt; do
    cp "$tmp/bpe.test.$L" "$prep/test.$L"
done
data/prepare-wmt14en2fr.sh
0 → 100644
View file @
71d2d44c
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# Downloads the WMT'14 English-French parallel corpora and test set,
# tokenizes them with the Moses scripts, learns a joint BPE vocabulary
# (subword-nmt) and writes BPE-encoded train/valid/test splits into
# wmt14_en_fr/.

echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt
BPE_TOKENS=40000

URLS=(
    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    "http://statmt.org/wmt13/training-parallel-un.tgz"
    "http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
    "http://statmt.org/wmt10/training-giga-fren.tar"
    "http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
    "training-parallel-europarl-v7.tgz"
    "training-parallel-commoncrawl.tgz"
    "training-parallel-un.tgz"
    "training-parallel-nc-v9.tgz"
    "training-giga-fren.tar"
    "test-full.tgz"
)
CORPORA=(
    "training/europarl-v7.fr-en"
    "commoncrawl.fr-en"
    "un/undoc.2000.fr-en"
    "training/news-commentary-v9.fr-en"
    "giga-fren.release2.fixed"
)

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    # BUGFIX: a bare 'exit' returns the status of the preceding echo (0),
    # silently reporting success; exit non-zero so callers see the failure.
    exit 1
fi

src=en
tgt=fr
lang=en-fr
prep=wmt14_en_fr
tmp=$prep/tmp
orig=orig

mkdir -p "$orig" "$tmp" "$prep"

cd "$orig"

for ((i=0;i<${#URLS[@]};++i)); do
    file=${FILES[i]}
    if [ -f "$file" ]; then
        echo "$file already exists, skipping download"
    else
        url=${URLS[i]}
        wget "$url"
        if [ -f "$file" ]; then
            echo "$url successfully downloaded."
        else
            echo "$url not successfully downloaded."
            # BUGFIX: 'exit -1' is not a valid exit status; use 1.
            exit 1
        fi
        # Unpack by extension ('${file: -4}' = last four characters).
        if [ "${file: -4}" == ".tgz" ]; then
            tar zxvf "$file"
        elif [ "${file: -4}" == ".tar" ]; then
            tar xvf "$file"
        fi
    fi
done

# The giga-fren tarball contains gzipped per-language files; decompress them.
gunzip giga-fren.release2.fixed.*.gz
cd ..

echo "pre-processing train data..."
for l in $src $tgt; do
    # -f: the tokenized file does not exist on the first run; a plain 'rm'
    # printed a spurious error.
    rm -f "$tmp/train.tags.$lang.tok.$l"
    for f in "${CORPORA[@]}"; do
        cat "$orig/$f.$l" | \
            perl "$NORM_PUNC" "$l" | \
            perl "$REM_NON_PRINT_CHAR" | \
            perl "$TOKENIZER" -threads 8 -a -l "$l" >> "$tmp/train.tags.$lang.tok.$l"
    done
done

echo "pre-processing test data..."
for l in $src $tgt; do
    if [ "$l" == "$src" ]; then
        t="src"
    else
        t="ref"
    fi
    # Extract segment text from the SGML test files and tokenize it.
    grep '<seg id' "$orig/test-full/newstest2014-fren-$t.$l.sgm" | \
        sed -e 's/<seg id="[0-9]*">\s*//g' | \
        sed -e 's/\s*<\/seg>\s*//g' | \
        sed -e "s/\’/\'/g" | \
        perl "$TOKENIZER" -threads 8 -a -l "$l" > "$tmp/test.$l"
    echo ""
done

echo "splitting train and valid..."
for l in $src $tgt; do
    # Hold out every 1333rd sentence pair for validation (en-fr is much
    # larger than en-de, so a sparser split keeps valid a similar size).
    awk '{if (NR%1333 == 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/valid.$l"
    awk '{if (NR%1333 != 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/train.$l"
done

TRAIN=$tmp/train.fr-en
BPE_CODE=$prep/code
rm -f "$TRAIN"
# Learn a joint BPE over the concatenation of both language sides.
for l in $src $tgt; do
    cat "$tmp/train.$l" >> "$TRAIN"
done

echo "learn_bpe.py on ${TRAIN}..."
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$tmp/bpe.$f"
    done
done

# Drop pairs with length ratio > 1.5 or length outside [1, 250] tokens.
perl "$CLEAN" -ratio 1.5 "$tmp/bpe.train" "$src" "$tgt" "$prep/train" 1 250
perl "$CLEAN" -ratio 1.5 "$tmp/bpe.valid" "$src" "$tgt" "$prep/valid" 1 250

for L in $src $tgt; do
    cp "$tmp/bpe.test.$L" "$prep/test.$L"
done
fairseq/tokenizer.py
View file @
71d2d44c
...
@@ -13,12 +13,11 @@ import torch
...
@@ -13,12 +13,11 @@ import torch
from
fairseq
import
dictionary
from
fairseq
import
dictionary
# Collapses any run of whitespace (spaces, tabs, newlines) into one space.
# Use a raw string for the pattern: in a plain string literal "\s" is an
# invalid escape sequence (DeprecationWarning since Python 3.6, and a
# SyntaxWarning from Python 3.12).
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split a line of text into whitespace-delimited tokens.

    All whitespace runs (including tabs and newlines) are collapsed to a
    single space and leading/trailing whitespace is stripped before
    splitting, so empty or all-whitespace input yields an empty list.

    Args:
        line: the input string.

    Returns:
        A list of token strings.
    """
    line = SPACE_NORMALIZER.sub(" ", line)
    line = line.strip()
    return line.split()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment