Commit 85cc80ab authored by kimiyoung

modify scripts, elaborate on usage and meanings

parent c6b06f76
#### Introduction
This directory contains our PyTorch implementation of Transformer-XL. Note that our state-of-the-art results reported in the paper were obtained by training the model on a large-scale TPU cluster, and our PyTorch codebase currently does not support distributed training. Here we provide two sets of hyperparameters and scripts:
- `*large.sh` are for the SoTA setting with large models which might not be directly runnable on a local GPU machine.
- `*base.sh` are for the base models which can be run on a few GPUs.
In our preliminary experiments, the PyTorch implementation produces results similar to those of the TF codebase under the same settings.
#### Prerequisite
- PyTorch 0.4: `conda install pytorch torchvision -c pytorch`
#### Data Preparation
`bash getdata.sh`
@@ -18,7 +26,7 @@
`bash run_enwik8.sh train --work_dir PATH_TO_WORK_DIR`
- - Testing
+ - Evaluation
`bash run_enwik8.sh eval --work_dir PATH_TO_WORK_DIR`
@@ -28,7 +36,7 @@
- Make sure the machine has **4 GPUs**, each with **at least 11G memory**
- - Training
+ - Evaluation
`bash run_wt103.sh train --work_dir PATH_TO_WORK_DIR`
@@ -44,8 +52,8 @@
- `--div_val`: when using adaptive softmax and embedding, the embedding dimension is divided by `div_val` from bin $i$ to bin $i+1$. This saves both GPU memory and the parameter budget (see the sketch after this list).
- `--fp16` and `--dynamic-loss-scale`: Run in pseudo-fp16 mode (fp16 storage, fp32 math) with dynamic loss scaling (also sketched after this list).
- Note: to use the `--fp16` option, please make sure the `apex` package is installed (https://github.com/NVIDIA/apex/).
- - `--attn_type`: set `attn_type` to 2 to use standard Transformer without any recurrence.
+ - To see performance without the recurrence mechanism, simply use `mem_len=0` in all your scripts.
+ - To see performance with a standard Transformer without relative positional encodings and recurrence mechanisms, use `attn_type=2` and `mem_len=0` (e.g., append `--attn_type 2 --mem_len 0` to the training commands above).
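To make the `--div_val` item above concrete, here is a minimal Python sketch of how the per-bin embedding dimensions shrink. The cutoff values and variable names are hypothetical and only illustrate the scaling; this is not the repo's actual code.

```python
# Illustration only: per-bin embedding sizes under adaptive embedding/softmax.
# The cutoffs below are hypothetical vocabulary bin boundaries; the real ones
# are chosen per dataset inside the training code.
d_embed = 1024
div_val = 4                       # value passed as --div_val
cutoffs = [20000, 60000, 200000]  # hypothetical bin boundaries

for i in range(len(cutoffs) + 1):
    d_bin = d_embed // (div_val ** i)
    print(f"bin {i}: embedding dim = {d_bin}")
# -> 1024, 256, 64, 16: rarer-word bins get smaller embeddings,
#    which is what saves GPU memory and parameters.
```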
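For `--fp16` with `--dynamic-loss-scale`, the general idea of dynamic loss scaling is roughly the following. This is a conceptual PyTorch-style sketch, not the apex-based implementation the scripts actually use; the function name and scale constants are made up.

```python
import math

scale = 2.0 ** 15  # hypothetical initial loss scale

def fp16_step(model, optimizer, loss):
    """One update with dynamic loss scaling; returns False if the step was skipped."""
    global scale
    optimizer.zero_grad()
    (loss * scale).backward()  # scale the loss so small fp16 gradients do not underflow
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    if any(not math.isfinite(float(g.abs().sum())) for g in grads):
        scale /= 2.0           # overflow detected: shrink the scale and skip the update
        return False
    for g in grads:
        g.div_(scale)          # unscale gradients before the fp32 master-weight update
    optimizer.step()
    scale *= 2.0 ** (1.0 / 1000)  # slowly grow the scale back (~2x per 1000 good steps)
    return True
```

In the actual scripts, apex handles this bookkeeping; the sketch only shows why a dynamically adjusted scale is needed when gradients are stored in fp16.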
#### Other datasets:
......
#!/bin/bash
if [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--cuda \
--data ../data/enwik8/ \
--dataset enwik8 \
--n_layer 24 \
--d_model 1024 \
--n_head 8 \
--d_head 128 \
--d_inner 3072 \
--dropout 0.15 \
--dropatt 0.15 \
--optim adam \
--lr 0.00025 \
--warmup_step 4000 \
--max_step 400000 \
--tgt_len 768 \
--mem_len 768 \
--eval_tgt_len 128 \
--batch_size 64 \
--multi_gpu \
--gpu0_bsz 0 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python eval.py \
--cuda \
--data ../data/enwik8/ \
--dataset enwik8 \
--tgt_len 128 \
--mem_len 3800 \
--clamp_len 1000 \
--same_length \
--split test \
${@:2}
else
echo 'unknown argument 1'
fi
@@ -7,16 +7,16 @@ if [[ $1 == 'train' ]]; then
--data ../data/one-billion-words/ \
--dataset lm1b \
--adaptive \
- --n_layer 12 \
+ --n_layer 18 \
--d_model 1024 \
--div_val 4 \
--n_head 8 \
--d_head 128 \
- --d_inner 2048 \
+ --d_inner 4096 \
--dropout 0.0 \
--dropatt 0.0 \
--optim adam \
- --warmup_step 4000 \
+ --warmup_step 20000 \
--max_step 500000 \
--lr 0.00025 \
--tgt_len 32 \
@@ -34,7 +34,7 @@ elif [[ $1 == 'eval' ]]; then
--dataset lm1b \
--batch_size 64 \
--tgt_len 32 \
- --mem_len 64 \
+ --mem_len 128 \
--split test \
--same_length \
${@:2}
......
#!/bin/bash
if [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--cuda \
--data ../data/one-billion-words/ \
--dataset lm1b \
--adaptive \
--div_val 4 \
--n_layer 24 \
--d_model 1280 \
--n_head 16 \
--d_head 80 \
--d_inner 8192 \
--dropout 0.05 \
--dropatt 0.05 \
--optim adam \
--warmup_step 30000 \
--max_step 1200000 \
--lr 0.00025 \
--tgt_len 32 \
--mem_len 32 \
--eval_tgt_len 32 \
--batch_size 512 \
--multi_gpu \
--gpu0_bsz 0 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python eval.py \
--cuda \
--data ../data/one-billion-words/ \
--dataset lm1b \
--batch_size 8 \
--tgt_len 32 \
--mem_len 128 \
--split test \
--same_length \
${@:2}
else
echo 'unknown argument 1'
fi
@@ -11,14 +11,18 @@ if [[ $1 == 'train' ]]; then
--n_head 8 \
--d_head 64 \
--d_inner 2048 \
- --dropout 0.2 \
+ --dropout 0.1 \
+ --dropatt 0.0 \
--optim adam \
--lr 0.00025 \
- --tgt_len 256 \
- --mem_len 256 \
- --eval_tgt_len 256 \
- --batch_size 16 \
- --max_step 1000000 \
+ --warmup_step 0 \
+ --max_step 400000 \
+ --tgt_len 512 \
+ --mem_len 512 \
+ --eval_tgt_len 128 \
+ --batch_size 22 \
+ --multi_gpu \
+ --gpu0_bsz 4 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
@@ -26,7 +30,9 @@ elif [[ $1 == 'eval' ]]; then
--cuda \
--data ../data/text8/ \
--dataset text8 \
- --tgt_len 256 \
+ --tgt_len 80 \
+ --mem_len 2100 \
+ --clamp_len 820 \
--same_length \
--split test \
${@:2}
......
#!/bin/bash
if [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--cuda \
--data ../data/text8/ \
--dataset text8 \
--n_layer 24 \
--d_model 1024 \
--n_head 8 \
--d_head 128 \
--d_inner 3072 \
--dropout 0.15 \
--dropatt 0.15 \
--optim adam \
--lr 0.00025 \
--tgt_len 768 \
--mem_len 768 \
--eval_tgt_len 128 \
--batch_size 64 \
--max_step 400000 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python eval.py \
--cuda \
--data ../data/text8/ \
--dataset text8 \
--tgt_len 128 \
--mem_len 3800 \
--clamp_len 1000 \
--same_length \
--split test \
${@:2}
else
echo 'unknown argument 1'
fi
#!/bin/bash
if [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--cuda \
--data ../data/wikitext-103/ \
--dataset wt103 \
--adaptive \
--div_val 4 \
--n_layer 18 \
--d_model 1024 \
--n_head 16 \
--d_head 64 \
--d_inner 4096 \
--dropout 0.2 \
--dropatt 0.2 \
--optim adam \
--lr 0.00025 \
--warmup_step 16000 \
--max_step 4000000 \
--tgt_len 384 \
--mem_len 384 \
--eval_tgt_len 128 \
--batch_size 128 \
--multi_gpu \
--gpu0_bsz 0 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python eval.py \
--cuda \
--data ../data/wikitext-103/ \
--dataset wt103 \
--tgt_len 128 \
--mem_len 1600 \
--clamp_len 1000 \
--same_length \
--split test \
${@:2}
else
echo 'unknown argument 1'
fi
## Introduction
This directory contains our TF implementation of Transformer-XL. Note that our state-of-the-art results reported in the paper were obtained by training the model on a large-scale TPU cluster, and our GPU codebase currently does not support distributed training. Here we provide two sets of hyperparameters and scripts:
- `*large_tpu.sh` are for the SoTA setting on TPUs. These are exactly the commands we used to obtain our best results.
- `*base_gpu.sh` are for the base models which can be run on a few GPUs.
## Prerequisite
- Python 2.7
@@ -78,39 +86,40 @@ For `dataset` in `[enwik8, lm1b, wt103, text8]`:
GPU:
- - create training and validation data: `bash scripts/dataset_gpu.sh train_data`
+ - create training and validation data: `bash scripts/dataset_base_gpu.sh train_data` (see the concrete example below)
- - create test data: `bash scripts/dataset_gpu.sh test_data`
+ - create test data: `bash scripts/dataset_base_gpu.sh test_data`
TPU:
- - Set the Google storage URL in `scripts/dataset_tpu.sh`:
+ - Set the Google storage URL in `scripts/dataset_large_tpu.sh`:
- `GSDATA`: data URL
- `GSEXP`: experiment URL
- - create training and validation data: `bash scripts/dataset_tpu.sh train_data`
+ - create training and validation data: `bash scripts/dataset_large_tpu.sh train_data`
- - create test data: `bash scripts/dataset_tpu.sh test_data`
+ - create test data: `bash scripts/dataset_large_tpu.sh test_data`
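Note that `dataset` in the script names above is a placeholder for one of the corpora listed earlier; for example, with `dataset = enwik8` the GPU data-preparation command would be something like `bash scripts/enwik8_base_gpu.sh train_data`, assuming the scripts follow the `*_base_gpu.sh` / `*_large_tpu.sh` naming used in the rest of this README.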
#### (2) Run training
- GPU:
+ Base models on GPUs:
- - Modify the configurations in `scripts/dataset_gpu.sh` according to your needs.
+ - Modify the configurations in `scripts/dataset_base_gpu.sh` according to your needs.
- - `bash scripts/dataset_gpu.sh train`
+ - `bash scripts/dataset_base_gpu.sh train`
+ - If enough resources are available, increase the model sizes (e.g., `N_LAYER`, `D_MODEL`, `D_EMBED`, `D_HEAD`, `D_INNER`) so that they are closer to the values defined in `scripts/dataset_large_tpu.sh`. Likewise, when resources are limited, decrease the model sizes. It is recommended to ensure that `D_MODEL == D_EMBED` and `D_MODEL == N_HEAD x D_HEAD`. When the model sizes increase, remember to increase `warmup_steps` accordingly to alleviate optimization difficulties. (A small sanity-check sketch follows below.)
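As a quick aid when hand-editing a base configuration, the following Python sketch (with hypothetical values; it is not part of the repo's scripts) checks the recommended relations from the note above:

```python
# Hypothetical hand-edited base config; scale the values to your hardware.
N_LAYER, D_MODEL, D_EMBED, N_HEAD, D_HEAD, D_INNER = 12, 512, 512, 8, 64, 2048
WARMUP_STEPS = 4000  # remember to raise this when the model sizes grow

# Recommended consistency checks.
assert D_MODEL == D_EMBED, "keep D_EMBED equal to D_MODEL"
assert D_MODEL == N_HEAD * D_HEAD, "D_MODEL should equal N_HEAD x D_HEAD"
```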
- TPU:
+ Larger models on TPUs:
- - Modify the configurations in `scripts/dataset_tpu.sh` according to your needs.
+ - Modify the configurations in `scripts/dataset_large_tpu.sh` according to your needs.
- - `bash scripts/dataset_tpu.sh train`
+ - `bash scripts/dataset_large_tpu.sh train`
#### (3) Run evaluation
- GPU:
+ Base models on GPUs:
- - `bash scripts/dataset_gpu.sh eval --eval_ckpt_path PATH_TO_CKPT`
+ - `bash scripts/dataset_base_gpu.sh eval --eval_ckpt_path PATH_TO_CKPT`
- TPU:
+ Larger models on TPUs:
- - `bash scripts/dataset_tpu.sh eval --eval_ckpt_path PATH_TO_CKPT`
+ - `bash scripts/dataset_large_tpu.sh eval --eval_ckpt_path PATH_TO_CKPT`
@@ -4,47 +4,47 @@
DATA_ROOT=../data/enwik8/
# Model
- N_LAYER=24
+ N_LAYER=12
- D_MODEL=1024
+ D_MODEL=512
- D_EMBED=1024
+ D_EMBED=512
N_HEAD=8
- D_HEAD=128
+ D_HEAD=64
- D_INNER=3072
+ D_INNER=2048
# Training
- TGT_LEN=256
+ TGT_LEN=512
- MEM_LEN=256
+ MEM_LEN=512
- BSZ=16
+ BSZ=24
- NUM_CORE=2
+ NUM_CORE=4
# Testing
- TEST_TGT_LEN=128
+ TEST_TGT_LEN=80
- TEST_MEM_LEN=3800
+ TEST_MEM_LEN=2100
- TEST_CLAMP_LEN=1000
+ TEST_CLAMP_LEN=820
- TEST_BSZ=16
+ TEST_BSZ=10
- TEST_NUM_CORE=4
+ TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
@@ -68,7 +68,7 @@ elif [[ $1 == 'train' ]]; then
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
- --save_steps=200 \
+ --save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
@@ -99,4 +99,4 @@ elif [[ $1 == 'eval' ]]; then
${@:2}
else
echo 'unknown argument 1'
fi
\ No newline at end of file
@@ -5,19 +5,19 @@ DATA_ROOT=../data/one-billion-words/
# Model
DIV_VAL=4
- N_LAYER=24
+ N_LAYER=18
- D_MODEL=1280
+ D_MODEL=1024
- D_EMBED=1280
+ D_EMBED=1024
- N_HEAD=16
+ N_HEAD=8
- D_HEAD=80
+ D_HEAD=128
- D_INNER=8192
+ D_INNER=4096
# Training
TGT_LEN=256
MEM_LEN=256
- BSZ=16
+ BSZ=256
- NUM_CORE=2
+ NUM_CORE=4
# Testing
TEST_TGT_LEN=32
......
@@ -4,47 +4,47 @@
DATA_ROOT=../data/text8/
# Model
- N_LAYER=24
+ N_LAYER=12
- D_MODEL=1024
+ D_MODEL=512
- D_EMBED=1024
+ D_EMBED=512
N_HEAD=8
- D_HEAD=128
+ D_HEAD=64
- D_INNER=3072
+ D_INNER=2048
# Training
- TGT_LEN=256
+ TGT_LEN=512
- MEM_LEN=256
+ MEM_LEN=512
- BSZ=16
+ BSZ=24
- NUM_CORE=2
+ NUM_CORE=4
# Testing
- TEST_TGT_LEN=128
+ TEST_TGT_LEN=80
- TEST_MEM_LEN=3800
+ TEST_MEM_LEN=2100
- TEST_CLAMP_LEN=1000
+ TEST_CLAMP_LEN=820
- TEST_BSZ=8
+ TEST_BSZ=10
- TEST_NUM_CORE=2
+ TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=text8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=text8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
@@ -68,7 +68,7 @@ elif [[ $1 == 'train' ]]; then
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
- --save_steps=200 \
+ --save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
@@ -99,4 +99,4 @@ elif [[ $1 == 'eval' ]]; then
${@:2}
else
echo 'unknown argument 1'
fi
\ No newline at end of file
@@ -4,49 +4,49 @@
DATA_ROOT=../data/wikitext-103/
# Model
- DIV_VAL=4
+ DIV_VAL=1
- N_LAYER=18
+ N_LAYER=16
- D_MODEL=1024
+ D_MODEL=410
- D_EMBED=1024
+ D_EMBED=410
- N_HEAD=16
+ N_HEAD=10
- D_HEAD=64
+ D_HEAD=41
- D_INNER=4096
+ D_INNER=2100
# Training
- TGT_LEN=256
+ TGT_LEN=150
- MEM_LEN=256
+ MEM_LEN=150
- BSZ=16
+ BSZ=60
- NUM_CORE=2
+ NUM_CORE=4
# Testing
- TEST_TGT_LEN=128
+ TEST_TGT_LEN=64
- TEST_MEM_LEN=1600
+ TEST_MEM_LEN=640
- TEST_CLAMP_LEN=1000
+ TEST_CLAMP_LEN=400
- TEST_BSZ=16
+ TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=wt103 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=wt103 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
@@ -105,4 +105,4 @@ elif [[ $1 == 'eval' ]]; then
${@:2}
else
echo 'unknown argument 1'
fi
\ No newline at end of file