Commit e21946e0 authored by Michael Carilli


Updated default sizes to be multiples of 8 to enable Tensor Core use.  Added performance guidelines to README.
parent def8fb85
@@ -9,10 +9,21 @@ The trained model can then be used by the generate script to generate new text.
`main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
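For orientation, the wrapping pattern is roughly as follows (a minimal sketch of typical `FP16_Optimizer` usage on a hypothetical toy model, not this example's exact training loop):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

# Hypothetical toy model, just to illustrate the wrapping pattern.
model = torch.nn.Linear(8, 8).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# FP16_Optimizer maintains fp32 master weights and manages loss scaling.
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

inputs = torch.randn(8, 8).cuda().half()
loss = model(inputs).float().sum()
optimizer.backward(loss)   # replaces loss.backward(); applies loss scaling
optimizer.step()
```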
With `--fp16`, to enable Tensor Core use and achieve best performance, all dimensions that participate in GEMMs in the model should be multiples of 8. Specifically, these are
* dictionary length (ntokens in `main.py`),
* embedding size (`--emsize`),
* hidden size (`--nhid`), and
* batch size (`--batch_size`).
The dictionary length is a property of the dataset, and is not controlled by a command line argument. In `main.py`, `corpus = data.Corpus(args.data, pad_to_multiple_of=8)` and the `Corpus` constructor in
`data.py` ensure that the dictionary length is a multiple of 8.
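The rounding itself is simple arithmetic; as a minimal sketch, a hypothetical helper that rounds a dimension up to the next multiple of 8:

```python
def round_up(n, multiple=8):
    """Round n up to the nearest multiple, e.g. 650 -> 656 and 1500 -> 1504."""
    remainder = n % multiple
    return n if remainder == 0 else n + (multiple - remainder)

assert round_up(650) == 656 and round_up(1500) == 1504
```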
Also, for mixed precision performance, a good general rule is: the more work you give the GPU, the better. Bigger models and larger batch sizes supply the cores with more work and do a better job of saturating the device. A (very rough) way to check whether you're saturating the device is to run `nvidia-smi` from another terminal and see what fraction of device memory you're using. This will tell you how much leeway you have to increase the model or batch size.
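If you want that check in script form, a rough sketch (assuming `nvidia-smi` is on your PATH; output formatting can vary by driver version):

```python
import subprocess

# Print used vs. total device memory, one line per GPU.
out = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader"])
print(out.decode().strip())
```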
```bash
python main.py --cuda --epochs 6         # Train a LSTM on Wikitext-2 with CUDA
python main.py --cuda --epochs 6 --fp16  # Train a LSTM on Wikitext-2 with CUDA and mixed precision
python main.py --cuda --epochs 6 --tied  # Train a tied LSTM on Wikitext-2 with CUDA
python main.py --cuda --tied             # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs
python generate.py                       # Generate samples from the trained LSTM model.
```
@@ -67,14 +78,13 @@ optional arguments:
```
which triggers the use of dynamic loss scaling. Supplying `--dynamic-loss-scale` will override the `--loss_scale` argument, if any.
With these arguments, a variety of models can be tested. For example:
```bash
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40 --tied
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40 --tied
```
Perplexities on PTB are equal or better than
...
@@ -18,12 +18,23 @@ class Dictionary(object):
class Corpus(object):
    def __init__(self, path, pad_to_multiple_of=1):
        # Synthetic elements used to pad the dictionary length.
        # It is assumed that these synthetic elements do not appear in the actual data files.
        self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)]
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
        # Pad dictionary size to desired multiple. For example, padding to a multiple of 8
        # is necessary to ensure Tensor Core usage for the decoder.
        pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of
        if pad_elem != pad_to_multiple_of:
            for i in range(pad_elem):
                self.dictionary.add_word(self.synthetic[i])

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
...
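As a hypothetical usage sketch of the padded corpus (assuming the Wikitext-2 files live under `./data/wikitext-2` as in this example):

```python
import data  # the data.py shown above

corpus = data.Corpus('./data/wikitext-2', pad_to_multiple_of=8)
# Synthetic padding words were added if needed, so the length is a multiple of 8.
assert len(corpus.dictionary) % 8 == 0
```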
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=1504,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=1504,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
@@ -29,11 +29,11 @@ parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.65,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
@@ -64,7 +64,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -99,6 +101,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
    if ntokens%8 != 0:
        print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
              "Tensor Core use for the decoder's GEMMs.".format(ntokens))
    if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
        print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
              "to ensure Tensor Core use for the RNN's GEMMs.".format(
              args.emsize, args.nhid, args.batch_size))

model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
...
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=1504,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=1504,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
@@ -29,7 +29,7 @@ parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
@@ -67,7 +67,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -102,6 +104,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
    if ntokens%8 != 0:
        print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
              "Tensor Core use for the decoder's GEMMs.".format(ntokens))
    if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
        print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
              "to ensure Tensor Core use for the RNN's GEMMs.".format(
              args.emsize, args.nhid, args.batch_size))

model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
...