Commit aed3086a authored by Michael Carilli

Merging in master

parents b5465fe6 9041a868
@@ -9,8 +9,20 @@ The trained model can then be used by the generate script to generate new text.
`main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
With `--fp16`, to enable Tensor Core use and achieve best performance, dimensions that participate in GEMMs in the model should be multiples of 8. Specifically, these are
* dictionary length (ntokens in `main.py`),
* embedding size (`--emsize`),
* hidden size (`--nhid`), and
* batch size (`--batch_size`).
The dictionary length is a property of the dataset, and is not controlled by a command line argument. In `main.py`, `corpus = data.Corpus(args.data, pad_to_multiple_of=8)` and the `Corpus` constructor in
`data.py` ensure that the dictionary length is a multiple of 8.
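The padding simply rounds the vocabulary size up to the next multiple of 8 by appending synthetic words. A minimal sketch of the arithmetic (the `ntokens` value below is a placeholder, not the real Wikitext-2 vocabulary size):

```python
# Hypothetical vocabulary size before padding.
ntokens = 10003

# Number of synthetic words needed to reach the next multiple of 8.
pad = (8 - ntokens % 8) % 8

padded_ntokens = ntokens + pad
assert padded_ntokens % 8 == 0
print(ntokens, "->", padded_ntokens)   # 10003 -> 10008
```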
Also, for mixed precision performance, a good general rule is: the more work you give the GPU, the better. Bigger models and larger batch sizes supply the cores with more work and do a better job of saturating the device. A (very rough) way to check whether you're saturating the device is to run `nvidia-smi` from another terminal and see what fraction of device memory you're using. This tells you how much leeway you have to increase the model or batch size.
```bash
python main.py --cuda --epochs 6 # Train an LSTM on Wikitext-2 with CUDA
python main.py --cuda --epochs 6 --fp16 # Train an LSTM on Wikitext-2 with CUDA and mixed precision
python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA
python main.py --cuda --tied # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs
python generate.py # Generate samples from the trained LSTM model.
@@ -67,12 +79,11 @@ optional arguments:
```
which triggers the use of dynamic loss scaling. Supplying `--dynamic-loss-scale` will override the `--loss_scale` argument, if any.
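These flags correspond to `FP16_Optimizer`'s `static_loss_scale` and `dynamic_loss_scale` constructor arguments. Below is a minimal, hypothetical sketch of the pattern; the toy model, shapes, and learning rate are placeholders, not the actual code in `main_fp16_optimizer.py`:

```python
import torch
from apex.fp16_utils import FP16_Optimizer

# Toy half-precision model, purely for illustration.
model = torch.nn.Linear(656, 656).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Static loss scaling (what --loss_scale N requests)...
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
# ...or dynamic loss scaling (what --dynamic-loss-scale requests),
# which takes precedence over any static value:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

# One training step: FP16_Optimizer owns the backward pass so it can
# apply (and, when dynamic, adjust) the loss scale.
inp = torch.randn(24, 656, device='cuda', dtype=torch.half)
loss = model(inp).float().sum()
optimizer.zero_grad()
optimizer.backward(loss)   # instead of loss.backward()
optimizer.step()
```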
With these arguments, a variety of models can be tested.
As an example, the following arguments produce slower but better models:
With these arguments, a variety of models can be tested. For example:
```bash
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40 --tied
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40 --tied
```
@@ -18,12 +18,23 @@ class Dictionary(object):
class Corpus(object):
def __init__(self, path):
def __init__(self, path, pad_to_multiple_of=1):
# Synthetic elements used to pad the dictionary length.
# It is assumed that these synthetic elements do not appear in the actual data files.
self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)]
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'train.txt'))
self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
self.test = self.tokenize(os.path.join(path, 'test.txt'))
# Pad dictionary size to desired multiple. For example, padding to a multiple of 8
# is necessary to ensure Tensor Core usage for the decoder.
pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of
if pad_elem != pad_to_multiple_of:
for i in range(pad_elem):
self.dictionary.add_word(self.synthetic[i])
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
......
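As a quick sanity check of the new constructor argument, one can build the corpus with padding enabled and confirm the dictionary length divides evenly by 8 (a hypothetical usage sketch, run from the example directory with the Wikitext-2 data in place):

```python
import data  # the example's data.py

# pad_to_multiple_of=8 appends synthetic "vvvvvvvvN" words until the
# dictionary length is a multiple of 8; the default of 1 adds nothing.
corpus = data.Corpus('./data/wikitext-2', pad_to_multiple_of=8)

ntokens = len(corpus.dictionary)
assert ntokens % 8 == 0, "dictionary was not padded to a multiple of 8"
print("padded dictionary size:", ntokens)
```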
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,11 +29,11 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
parser.add_argument('--dropout', type=float, default=0.65,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
@@ -64,7 +64,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -99,6 +101,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
......
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,7 +29,7 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
@@ -67,7 +67,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -102,6 +104,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
......