Unverified Commit d1d74413 authored by Jun Ru Anderson, committed by GitHub

[test] specify chunks for pipe/transformer benchmark (#52)



* specify chunks for pipe/transformer benchmark

Set chunks equal to len(balance) for the pipe/transformer benchmark. The words-per-second and memory-usage checks will be updated in the next commit (the benchmark must be run on CircleCI to find appropriate values).
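For context, a minimal sketch of what the change amounts to (assuming fairscale's `Pipe` takes `balance` and `chunks` as shown in the diff below; the toy `nn.Sequential` here is illustrative, not the benchmark's transformer):

```python
# Minimal sketch, not the benchmark itself: wrap a sequential model in Pipe
# with one micro-batch (chunk) per pipeline partition.
import torch.nn as nn
from fairscale.nn import Pipe  # assumes fairscale is installed

# Toy stand-in for the benchmark's TransformerLMSequntial model.
model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 32), nn.ReLU())

balance = [2, 2]  # two layers per partition; needs one CUDA device per entry

# chunks=len(balance) mirrors the change made in this commit.
pipe = Pipe(model, balance, chunks=len(balance))
```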

* change benchmark words per second and memory usage

Did six runs for words-per-second, with results: 9144.40, 9163.91, 9993.01, 9082.82, 9155.09 and 9000.67.
Peak allocated bytes per device (which do not change between runs) were 193206272, 645632, 562688 and 92688384 for devices 0, 1, 2 and 3, respectively.

* increase batch size

The batch size was small enough that the GPUs' compute was not the bottleneck, which slowed training and in particular made using more chunks slower. Increasing the batch size therefore increased training speed.

* update benchmark numbers

Ran six times, with wps 36917.44, 36797.65, 37006.03, 36872.84, 37129.31 and 37003.31, and peak allocated bytes 4061909504, 4050944, 10427392 and 2031824896 for devices 0, 1, 2 and 3, respectively. The updated assertion threshold follows from these runs; see the sketch after this message.
Co-authored-by: Jun Ru Anderson <andersonic@fb.com>
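As a quick sanity check (not part of the commit), the words-per-second floor used in the updated assertion can be reproduced from the six runs listed above:

```python
# Sketch only: derive the assertion threshold from the six golden runs above.
from statistics import mean, stdev

wps_runs = [36917.44, 36797.65, 37006.03, 36872.84, 37129.31, 37003.31]

avg = mean(wps_runs)     # ~36954.4
sd = stdev(wps_runs)     # ~116.8 (sample standard deviation)
floor = avg - 3 * sd     # ~36603.9, i.e. `wps > 36954.4 - (3 * 116.825)`
print(avg, sd, floor)
```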
parent ab32cb7d
@@ -98,8 +98,8 @@ def get_data(device):
     TEXT.build_vocab(train_txt)
     ntokens = len(TEXT.vocab.stoi)
-    batch_size = 20
-    eval_batch_size = 10
+    batch_size = 500
+    eval_batch_size = 200
     train_data = batchify(train_txt, batch_size, TEXT, device)
     val_data = batchify(val_txt, eval_batch_size, TEXT, device)
     test_data = batchify(test_txt, eval_batch_size, TEXT, device)
@@ -131,7 +131,7 @@ def make_model(device, ntokens):
     model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
     balance = generate_balance(min(num_devices, 4), len(model))
-    p = Pipe(model, balance)
+    p = Pipe(model, balance, chunks=len(balance))
     criterion = nn.CrossEntropyLoss()
     lr = 0.0005 # learning rate
@@ -161,7 +161,7 @@ def train(train_data, model, criterion, optimizer, bptt, ntokens):
         optimizer.step()
         total_loss += loss.item()
-        log_interval = 200
+        log_interval = 50
         if batch % log_interval == 0 and batch > 0:
             cur_loss = total_loss / log_interval
             elapsed = time.time() - start_time
@@ -227,7 +227,7 @@ def benchmark_language_model(train_data, val_data, test_data, model, criterion,
     if can_benchmark and len(model.balance) == 4:
         # Assert that words per second is within 3 standard deviations of the average
         # of six golden runs
-        assert wps > 27799.2 - (3 * 522.145)
+        assert wps > 36954.4 - (3 * 116.825)
         print("Peak allocated bytes on cuda:0: {:1d}".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"]))
         print("Peak allocated bytes on cuda:1: {:1d}".format(torch.cuda.memory_stats(1)["allocated_bytes.all.peak"]))
@@ -236,10 +236,10 @@ def benchmark_language_model(train_data, val_data, test_data, model, criterion,
         # Assert that memory usage on each GPU is within 10% of golden run
         # Right-hand-side is golden run bytes * 110%
-        assert torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] < 193206272 * 1.1
-        assert torch.cuda.memory_stats(1)["allocated_bytes.all.peak"] < 640512 * 1.1
-        assert torch.cuda.memory_stats(2)["allocated_bytes.all.peak"] < 1412608 * 1.1
-        assert torch.cuda.memory_stats(3)["allocated_bytes.all.peak"] < 95364608 * 1.1
+        assert torch.cuda.memory_stats(0)["allocated_bytes.all.peak"] < 4061909504 * 1.1
+        assert torch.cuda.memory_stats(1)["allocated_bytes.all.peak"] < 4050944 * 1.1
+        assert torch.cuda.memory_stats(2)["allocated_bytes.all.peak"] < 10427392 * 1.1
+        assert torch.cuda.memory_stats(3)["allocated_bytes.all.peak"] < 2031824896 * 1.1
         print("No regression detected")