"python/pyproject.toml" did not exist on "b675ecaec40d11ef8c0980de748e1a0471fcf5c4"
Commit a8260d52 authored by Sergey Edunov's avatar Sergey Edunov
Browse files

BPE transformation for IWSLT

parent ae0c05d9
...@@ -108,7 +108,7 @@ $ cd .. ...@@ -108,7 +108,7 @@ $ cd ..
$ TEXT=data/iwslt14.tokenized.de-en $ TEXT=data/iwslt14.tokenized.de-en
$ python preprocess.py --source-lang de --target-lang en \ $ python preprocess.py --source-lang de --target-lang en \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--thresholdtgt 3 --thresholdsrc 3 --destdir data-bin/iwslt14.tokenized.de-en --destdir data-bin/iwslt14.tokenized.de-en
``` ```
This will write binarized data that can be used for model training to `data-bin/iwslt14.tokenized.de-en`. This will write binarized data that can be used for model training to `data-bin/iwslt14.tokenized.de-en`.
......
...@@ -5,10 +5,15 @@ ...@@ -5,10 +5,15 @@
echo 'Cloning Moses github repository (for tokenization scripts)...' echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt
BPE_TOKENS=10000
URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
GZ=de-en.tgz GZ=de-en.tgz
...@@ -81,13 +86,30 @@ done ...@@ -81,13 +86,30 @@ done
echo "creating train, valid, test..." echo "creating train, valid, test..."
for l in $src $tgt; do for l in $src $tgt; do
awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.$l awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l
awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.$l awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l
cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
$tmp/IWSLT14.TEDX.dev2012.de-en.$l \ $tmp/IWSLT14.TEDX.dev2012.de-en.$l \
$tmp/IWSLT14.TED.tst2010.de-en.$l \ $tmp/IWSLT14.TED.tst2010.de-en.$l \
$tmp/IWSLT14.TED.tst2011.de-en.$l \ $tmp/IWSLT14.TED.tst2011.de-en.$l \
$tmp/IWSLT14.TED.tst2012.de-en.$l \ $tmp/IWSLT14.TED.tst2012.de-en.$l \
> $prep/test.$l > $tmp/test.$l
done
TRAIN=$tmp/train.en-de
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L test.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
done
done done
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment