"torchvision/vscode:/vscode.git/clone" did not exist on "8837e0efbe16dc07ccdd4f1d06460643c7b41c50"
Commit 9f325542 authored by Zihao Ye, committed by GitHub

[Model]Transformer (#186)

* change the signature of node/edge filter

* upd filter

* Support multi-dimension node feature in SPMV

* push transformer

* remove some experimental settings

* stable version

* hotfix

* upd tutorial

* upd README

* merge

* remove redundancy

* remove tqdm

* several changes

* Refactor

* Refactor

* tutorial train

* fixed a bug

* fixed perf issue

* upd

* change dir

* move un-related to contrib

* tutorial code

* remove redundancy

* upd

* upd

* upd

* upd

* improve viz

* universal done

* halt norm

* fixed a bug

* add draw graph

* fixed several bugs

* remove dependency on core

* upd format of README

* trigger

* trigger

* upd viz

* trigger

* add transformer tutorial

* fix tutorial

* fix readme

* small fix on tutorials

* url fix in readme

* fixed func link

* upd
# Beam Search Module
import os
import argparse
import numpy as np
import torch as th
from modules import *
from dataset import *

k = 5  # Beam size

if __name__ == '__main__':
    argparser = argparse.ArgumentParser('testing translation model')
    argparser.add_argument('--gpu', default=-1, type=int, help='gpu id')
    argparser.add_argument('--N', default=6, type=int, help='num of layers')
    argparser.add_argument('--dataset', default='multi30k', help='dataset')
    argparser.add_argument('--batch', default=64, type=int, help='batch size')
    argparser.add_argument('--universal', action='store_true', help='use universal transformer')
    argparser.add_argument('--checkpoint', type=int, help='checkpoint: you must specify it')
    argparser.add_argument('--print', action='store_true', help='whether to print translated text')
    args = argparser.parse_args()

    # Collapse the remaining args into a tag that names the checkpoint file.
    args_filter = ['batch', 'gpu', 'print']
    exp_setting = '-'.join('{}'.format(v) for key, v in vars(args).items() if key not in args_filter)
    device = 'cpu' if args.gpu == -1 else 'cuda:{}'.format(args.gpu)

    dataset = get_dataset(args.dataset)
    V = dataset.vocab_size
    dim_model = 512

    fpred = open('pred.txt', 'w')
    fref = open('ref.txt', 'w')
    graph_pool = GraphPool()

    model = make_model(V, V, N=args.N, dim_model=dim_model)
    with open('checkpoints/{}.pkl'.format(exp_setting), 'rb') as f:
        model.load_state_dict(th.load(f, map_location=lambda storage, loc: storage))
    model = model.to(device)
    model.eval()

    # Translate the test set with beam search and write out predictions.
    test_iter = dataset(graph_pool, mode='test', batch_size=args.batch, devices=[device], k=k)
    for i, g in enumerate(test_iter):
        with th.no_grad():
            output = model.infer(g, dataset.MAX_LENGTH, dataset.eos_id, k)
        for line in dataset.get_sequence(output):
            if args.print:
                print(line)
            print(line, file=fpred)

    for line in dataset.tgt['test']:
        print(line.strip(), file=fref)
    fpred.close()
    fref.close()

    # Score the predictions against the references, then clean up.
    os.system(r'bash scripts/bleu.sh pred.txt ref.txt')
    os.remove('pred.txt')
    os.remove('ref.txt')
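
`model.infer` above is defined in modules/ and is not part of this file. As a rough mental model of what decoding with beam size k involves, here is a minimal, self-contained sketch; `step_log_probs` is a hypothetical stand-in for a single decoder step, not the example's actual API.

def beam_search(step_log_probs, bos_id, eos_id, k, max_len):
    """step_log_probs(prefix) -> (V,) tensor of next-token log-probabilities."""
    beams = [([bos_id], 0.0)]   # (prefix, cumulative log-prob)
    finished = []
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            top_p, top_ids = step_log_probs(seq).topk(k)
            for p, idx in zip(top_p.tolist(), top_ids.tolist()):
                candidates.append((seq + [idx], score + p))
        # Keep the k best prefixes; hypotheses that emit EOS are done.
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for seq, score in candidates[:k]:
            (finished if seq[-1] == eos_id else beams).append((seq, score))
        if not beams:
            break
    return max(finished + beams, key=lambda c: c[1])[0]

Scores here are raw sums of log-probabilities; practical decoders usually add length normalization so short hypotheses do not dominate.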
"""
In current version we use multi30k as the default training and validation set.
Multi-GPU support is required to train the model on WMT14.
"""
from modules import *
from parallel import *
from loss import *
from optims import *
from dataset import *
from modules.config import *
from modules.viz import *
from tqdm import tqdm
import numpy as np
import argparse
def run_epoch(data_iter, model, loss_compute, is_train=True):
    universal = isinstance(model, UTransformer)
    for i, g in tqdm(enumerate(data_iter)):
        with th.set_grad_enabled(is_train):
            if isinstance(model, list):
                # Multi-GPU: `g` is a list of batched graphs, one per replica.
                model = model[:len(g)]
                output = parallel_apply(model, g)
                tgt_y = [graph.tgt_y for graph in g]
                n_tokens = [graph.n_tokens for graph in g]
            else:
                if universal:
                    output, loss_act = model(g)
                    if is_train:
                        loss_act.backward(retain_graph=True)
                else:
                    output = model(g)
                tgt_y = g.tgt_y
                n_tokens = g.n_tokens
            loss = loss_compute(output, tgt_y, n_tokens)

    if universal:
        # Report the percentage of nodes still active at each ACT step.
        for step in range(1, model.MAX_DEPTH + 1):
            print("nodes entering step {}: {:.2f}%".format(step, 100.0 * model.stat[step] / model.stat[0]))
        model.reset_stat()
    print('average loss: {}'.format(loss_compute.avg_loss))
    print('accuracy: {}'.format(loss_compute.accuracy))
if __name__ == '__main__':
    if not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')
    np.random.seed(1111)

    argparser = argparse.ArgumentParser('training translation model')
    argparser.add_argument('--gpus', default='-1', type=str, help='gpu id')
    argparser.add_argument('--N', default=6, type=int, help='enc/dec layers')
    argparser.add_argument('--dataset', default='multi30k', help='dataset')
    argparser.add_argument('--batch', default=128, type=int, help='batch size')
    argparser.add_argument('--viz', action='store_true', help='visualize attention')
    argparser.add_argument('--universal', action='store_true', help='use universal transformer')
    args = argparser.parse_args()

    args_filter = ['batch', 'gpus', 'viz']
    exp_setting = '-'.join('{}'.format(v) for key, v in vars(args).items() if key not in args_filter)
    devices = ['cpu'] if args.gpus == '-1' else [int(gpu_id) for gpu_id in args.gpus.split(',')]

    dataset = get_dataset(args.dataset)
    V = dataset.vocab_size
    criterion = LabelSmoothing(V, padding_idx=dataset.pad_id, smoothing=0.1)
    dim_model = 512
    graph_pool = GraphPool()

    model = make_model(V, V, N=args.N, dim_model=dim_model, universal=args.universal)
    # Share embedding weights between encoder, decoder, and output projection.
    model.src_embed.lut.weight = model.tgt_embed.lut.weight
    model.generator.proj.weight = model.tgt_embed.lut.weight

    model, criterion = model.to(devices[0]), criterion.to(devices[0])
    model_opt = NoamOpt(dim_model, 1, 400,
                        th.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9))
    if len(devices) > 1:
        model, criterion = map(nn.parallel.replicate, [model, criterion], [devices, devices])
    loss_compute = SimpleLossCompute if len(devices) == 1 else MultiGPULossCompute

    for epoch in range(100):
        train_iter = dataset(graph_pool, mode='train', batch_size=args.batch, devices=devices)
        valid_iter = dataset(graph_pool, mode='valid', batch_size=args.batch, devices=devices)

        print('Epoch: {} Training...'.format(epoch))
        model.train(True)
        run_epoch(train_iter, model,
                  loss_compute(criterion, model_opt), is_train=True)

        print('Epoch: {} Evaluating...'.format(epoch))
        model.att_weight_map = None
        model.eval()
        run_epoch(valid_iter, model,
                  loss_compute(criterion, None), is_train=False)

        # Visualize attention maps for a fixed validation example.
        if args.viz:
            src_seq = dataset.get_seq_by_id(VIZ_IDX, mode='valid', field='src')
            tgt_seq = dataset.get_seq_by_id(VIZ_IDX, mode='valid', field='tgt')[:-1]
            draw_atts(model.att_weight_map, src_seq, tgt_seq, exp_setting, 'epoch_{}'.format(epoch))
        print('----------------------------------')

        with open('checkpoints/{}-{}.pkl'.format(exp_setting, epoch), 'wb') as f:
            th.save(model.state_dict(), f)
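
NoamOpt is defined in optims.py and its implementation is not shown in this diff. For reference, the learning-rate schedule it is named after (Vaswani et al., "Attention is All You Need") can be written in a few lines; the constants below mirror the constructor call above (factor 1, 400 warmup steps), and this function is an illustrative sketch rather than the repository's code.

def noam_rate(step, dim_model=512, factor=1.0, warmup=400):
    # lr = factor * dim_model^-0.5 * min(step^-0.5, step * warmup^-1.5):
    # linear warmup for `warmup` steps, then inverse-square-root decay.
    return factor * dim_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# The rate peaks at step == warmup: here noam_rate(400) ≈ 2.2e-3,
# decaying to noam_rate(10000) ≈ 4.4e-4.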
@@ -33,7 +33,7 @@ Graph Neural Network and its variant
  DGL.
* **SSE** `[paper] <http://proceedings.mlr.press/v80/dai18a/dai18a.pdf>`__
  `[tutorial] <1_gnn/8_sse_mx.html>`__ `[code]
  <https://github.com/jermainewang/dgl/blob/master/examples/mxnet/sse>`__:
  the emphasis here is a *giant* graph that cannot fit comfortably on one GPU
  card. SSE is an example to illustrate the co-design of both algorithm and
......
@@ -13,9 +13,11 @@ Old (new) wines in new bottle
  with non-parametric message-passing. We show how the latter can be nicely
  implemented with DGL APIs.

* **Transformer** `[paper] <https://arxiv.org/abs/1706.03762>`__ `[tutorial] <4_old_wines/7_transformer.html>`__
  `[code] <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/transformer>`__ and **Universal Transformer**
  `[paper] <https://arxiv.org/abs/1807.03819>`__ `[tutorial] <4_old_wines/7_transformer.html>`__
  `[code] <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/transformer/modules/act.py>`__:
  these two models replace RNN with several layers of multi-head attention to
  encode and discover structures among tokens of a sentence. These attention
  mechanisms can similarly be formulated as graph operations with
......
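
The claim that these attention mechanisms "can similarly be formulated as graph operations" is concrete enough to sketch. The snippet below is a generic single-head illustration in plain PyTorch, not the DGL implementation the README links to: each edge carries a score computed from its endpoint features, and a softmax normalized per destination node recovers softmax(QK^T/sqrt(d))V when the graph is fully connected. Q/K/V projections and numerical-stability tricks are omitted.

import torch as th

def edge_attention(h, src, dst):
    """h: (N, d) node features; src, dst: (E,) long tensors of edge endpoints.
    Returns, for every node, the attention-weighted sum of its in-neighbors."""
    d = h.size(1)
    scores = (h[src] * h[dst]).sum(-1) / d ** 0.5            # one score per edge
    w = scores.exp()                                         # unnormalized weights
    denom = th.zeros(h.size(0), dtype=h.dtype).index_add_(0, dst, w)
    alpha = w / denom[dst]                                   # softmax over incoming edges
    out = th.zeros_like(h)
    out.index_add_(0, dst, alpha.unsqueeze(-1) * h[src])     # aggregate weighted messages
    return out

# A fully connected 3-node graph (with self-loops) reduces to dense attention:
idx = th.arange(3)
out = edge_attention(th.randn(3, 8), idx.repeat_interleave(3), idx.repeat(3))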