Commit aed3086a authored by Michael Carilli

Merging in master

parents b5465fe6 9041a868
@@ -9,8 +9,20 @@ The trained model can then be used by the generate script to generate new text.
`main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
With `--fp16`, to enable Tensor Core use and achieve best performance, dimensions that participate in GEMMs in the model should be multiples of 8. Specifically, these are
* dictionary length (ntokens in `main.py`),
* embedding size (`--emsize`),
* hidden size (`--nhid`), and
* batch size (`--batch_size`).
The dictionary length is a property of the dataset, and is not controlled by a command line argument. In `main.py`, `corpus = data.Corpus(args.data, pad_to_multiple_of=8)` and the `Corpus` constructor in
`data.py` ensure that the dictionary length is a multiple of 8.
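The padding simply rounds the vocabulary size up to the next multiple of 8 by appending synthetic words. A minimal sketch of the arithmetic (the `ntokens` value below is a placeholder, not the real Wikitext-2 vocabulary size):

```python
# Hypothetical vocabulary size before padding.
ntokens = 10003

# Number of synthetic words needed to reach the next multiple of 8.
pad = (8 - ntokens % 8) % 8

padded_ntokens = ntokens + pad
assert padded_ntokens % 8 == 0
print(ntokens, "->", padded_ntokens)   # 10003 -> 10008
```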
Also, for mixed precision performance, a good general rule is: the more work you give the GPU, the better. Bigger models and larger batch sizes supply the cores with more work and do a better job of saturating the device. A (very rough) way to check whether you're saturating the device is to run `nvidia-smi` from another terminal and see what fraction of device memory you're using. This tells you how much leeway you have to increase the model or batch size.
```bash
python main.py --cuda --epochs 6 # Train an LSTM on Wikitext-2 with CUDA
python main.py --cuda --epochs 6 --fp16 # Train an LSTM on Wikitext-2 with CUDA and mixed precision
python main.py --cuda --epochs 6 --tied # Train a tied LSTM on Wikitext-2 with CUDA
python main.py --cuda --tied # Train a tied LSTM on Wikitext-2 with CUDA for 40 epochs
python generate.py # Generate samples from the trained LSTM model.
@@ -67,12 +79,11 @@ optional arguments:
```
which triggers the use of dynamic loss scaling. Supplying `--dynamic-loss-scale` will override the `--loss_scale` argument, if any.
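These flags correspond to `FP16_Optimizer`'s `static_loss_scale` and `dynamic_loss_scale` constructor arguments. Below is a minimal, hypothetical sketch of the pattern; the toy model, shapes, and learning rate are placeholders, not the actual code in `main_fp16_optimizer.py`:

```python
import torch
from apex.fp16_utils import FP16_Optimizer

# Toy half-precision model, purely for illustration.
model = torch.nn.Linear(656, 656).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Static loss scaling (what --loss_scale N requests)...
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
# ...or dynamic loss scaling (what --dynamic-loss-scale requests),
# which takes precedence over any static value:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

# One training step: FP16_Optimizer owns the backward pass so it can
# apply (and, when dynamic, adjust) the loss scale.
inp = torch.randn(24, 656, device='cuda', dtype=torch.half)
loss = model(inp).float().sum()
optimizer.zero_grad()
optimizer.backward(loss)   # instead of loss.backward()
optimizer.step()
```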
With these arguments, a variety of models can be tested.
As an example, the following arguments produce slower but better models:
With these arguments, a variety of models can be tested. For example:
```bash
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40
python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40
python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40
python main.py --cuda --emsize 656 --nhid 656 --dropout 0.5 --epochs 40 --tied
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40
python main.py --cuda --emsize 1504 --nhid 1504 --dropout 0.65 --epochs 40 --tied
```
@@ -18,12 +18,23 @@ class Dictionary(object):
class Corpus(object):
def __init__(self, path):
def __init__(self, path, pad_to_multiple_of=1):
# Synthetic elements used to pad the dictionary length.
# It is assumed that these synthetic elements do not appear in the actual data files.
self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of-1)]
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'train.txt'))
self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
self.test = self.tokenize(os.path.join(path, 'test.txt'))
# Pad dictionary size to desired multiple. For example, padding to a multiple of 8
# is necessary to ensure Tensor Core usage for the decoder.
pad_elem = pad_to_multiple_of - len(self.dictionary)%pad_to_multiple_of
if pad_elem != pad_to_multiple_of:
for i in range(pad_elem):
self.dictionary.add_word(self.synthetic[i])
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
......
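As a quick sanity check of the new constructor argument, one can build the corpus with padding enabled and confirm the dictionary length divides evenly by 8 (a hypothetical usage sketch, run from the example directory with the Wikitext-2 data in place):

```python
import data  # the example's data.py

# pad_to_multiple_of=8 appends synthetic "vvvvvvvvN" words until the
# dictionary length is a multiple of 8; the default of 1 adds nothing.
corpus = data.Corpus('./data/wikitext-2', pad_to_multiple_of=8)

ntokens = len(corpus.dictionary)
assert ntokens % 8 == 0, "dictionary was not padded to a multiple of 8"
print("padded dictionary size:", ntokens)
```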
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,11 +29,11 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
parser.add_argument('--dropout', type=float, default=0.65,
help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
help='tie the word embedding and softmax weights')
@@ -64,7 +64,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -99,6 +101,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
......
@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
parser.add_argument('--emsize', type=int, default=1504,
help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
parser.add_argument('--nhid', type=int, default=1504,
help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
help='number of layers')
@@ -29,7 +29,7 @@ parser.add_argument('--clip', type=float, default=0.25,
help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
parser.add_argument('--batch_size', type=int, default=24, metavar='N',
help='batch size')
parser.add_argument('--bptt', type=int, default=35,
help='sequence length')
@@ -67,7 +67,9 @@ if args.fp16 and not args.cuda:
# Load data
###############################################################################
corpus = data.Corpus(args.data)
# Ensure that the dictionary length is a multiple of 8,
# so that the decoder's GEMMs will use Tensor Cores.
corpus = data.Corpus(args.data, pad_to_multiple_of=8)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
@@ -102,6 +104,16 @@ test_data = batchify(corpus.test, eval_batch_size)
###############################################################################
ntokens = len(corpus.dictionary)
if args.fp16 and args.cuda:
if ntokens%8 != 0:
print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
"Tensor Core use for the decoder's GEMMs.".format(ntokens))
if args.emsize%8 != 0 or args.nhid%8 != 0 or args.batch_size%8 != 0:
print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
"to ensure Tensor Core use for the RNN's GEMMs.".format(
args.emsize, args.nhid, args.batch_size))
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
if args.cuda and args.fp16:
......