OpenDAS / Fairseq · Commits · a8260d52
"tests/git@developer.sourcefind.cn:OpenDAS/fairscale.git" did not exist on "d60fc2841a01c5c4033a5dcee709b4fd7a3dfadd"
Commit a8260d52, authored Oct 11, 2017 by Sergey Edunov

BPE transformation for IWSLT

Parent: ae0c05d9
Showing 2 changed files with 26 additions and 4 deletions (+26 / -4):

- README.md (+1 / -1)
- data/prepare-iwslt14.sh (+25 / -3)
README.md

@@ -108,7 +108,7 @@ $ cd ..
 $ TEXT=data/iwslt14.tokenized.de-en
 $ python preprocess.py --source-lang de --target-lang en \
   --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
-  --thresholdtgt 3 --thresholdsrc 3
+  --destdir data-bin/iwslt14.tokenized.de-en
 ```
 This will write binarized data that can be used for model training to
 `data-bin/iwslt14.tokenized.de-en`.
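For context, the README lines around this hunk (the hunk header shows `$ cd ..`, i.e. the prepare script is run from inside `data/`) imply the end-to-end flow sketched below. The `bash prepare-iwslt14.sh` invocation and the working directory are assumptions based on that surrounding context, not part of this diff:

```
# Sketch of the flow this commit implies; the prepare step is assumed
# from the README context around the changed hunk.
$ cd data/
$ bash prepare-iwslt14.sh      # download, tokenize, lowercase, learn + apply BPE
$ cd ..
$ TEXT=data/iwslt14.tokenized.de-en
$ python preprocess.py --source-lang de --target-lang en \
  --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
  --destdir data-bin/iwslt14.tokenized.de-en
```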
data/prepare-iwslt14.sh

@@ -5,10 +5,15 @@
 echo 'Cloning Moses github repository (for tokenization scripts)...'
 git clone https://github.com/moses-smt/mosesdecoder.git
 
+echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
+git clone https://github.com/rsennrich/subword-nmt.git
+
 SCRIPTS=mosesdecoder/scripts
 TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
 LC=$SCRIPTS/tokenizer/lowercase.perl
 CLEAN=$SCRIPTS/training/clean-corpus-n.perl
+BPEROOT=subword-nmt
+BPE_TOKENS=10000
 
 URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
 GZ=de-en.tgz
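The new `BPEROOT` and `BPE_TOKENS` variables wire up the two subword-nmt entry points used in the next hunk: `learn_bpe.py` learns a fixed number of merge operations (here 10000) from a corpus, and `apply_bpe.py` segments text with the learned codes. A toy illustration follows; the `-s`/`-c` flags are exactly the ones the script uses below, while the three-line corpus and file names are made up:

```
# Toy run of the two subword-nmt tools; corpus and file names are made up.
printf 'low lower lowest\nnew newer newest\n' > toy.txt
python subword-nmt/learn_bpe.py -s 10 < toy.txt > toy.codes   # learn 10 merge operations
python subword-nmt/apply_bpe.py -c toy.codes < toy.txt        # emits subword-segmented text,
                                                              # '@@' marks word-internal splits
```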
@@ -81,13 +86,30 @@ done
 echo "creating train, valid, test..."
 for l in $src $tgt; do
-    awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.$l
-    awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.$l
+    awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l
+    awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l
 
     cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
         $tmp/IWSLT14.TEDX.dev2012.de-en.$l \
         $tmp/IWSLT14.TED.tst2010.de-en.$l \
         $tmp/IWSLT14.TED.tst2011.de-en.$l \
         $tmp/IWSLT14.TED.tst2012.de-en.$l \
-        > $prep/test.$l
+        > $tmp/test.$l
 done
 
+TRAIN=$tmp/train.en-de
+BPE_CODE=$prep/code
+rm -f $TRAIN
+for l in $src $tgt; do
+    cat $tmp/train.$l >> $TRAIN
+done
+
+echo "learn_bpe.py on ${TRAIN}..."
+python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
+
+for L in $src $tgt; do
+    for f in train.$L valid.$L test.$L; do
+        echo "apply_bpe.py to ${f}..."
+        python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
+    done
+done
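Two things worth noting in this hunk. First, the raw train/valid/test splits now land in `$tmp` rather than `$prep`, because the final files in `$prep` are produced by `apply_bpe.py` at the end. Second, `$TRAIN` concatenates the German and English sides before `learn_bpe.py`, so a single joint set of merge operations is shared across source and target. The `NR%23` split sends every 23rd sentence pair (roughly 4.3%) to the validation set; a quick sanity check on a made-up 100-line file:

```
# Sanity check of the NR%23 split; sample.txt is a made-up input.
seq 100 > sample.txt
awk '{if (NR%23 == 0) print $0; }' sample.txt | wc -l   # -> 4  (valid)
awk '{if (NR%23 != 0) print $0; }' sample.txt | wc -l   # -> 96 (train)
```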