Unverified Commit 598d746e authored by Chang Liu, committed by GitHub

[Example][Bugfix] Remove all torchtext legacy-related APIs for pytorch/pinsage example (#4130)



* Remove all torchtext legacy-related APIs

* Remove unused BagOfWordsPretrained class, and fix some typos
Co-authored-by: Mufei Li <mufeili1996@gmail.com>
parent 077e002f
@@ -55,9 +55,9 @@ class PandasGraphBuilder(object):
     >>> builder.add_binary_relations(plays, 'user_id', 'game_id', 'plays')
     >>> builder.add_binary_relations(plays, 'game_id', 'user_id', 'played-by')
     >>> g = builder.build()
-    >>> g.number_of_nodes('user')
+    >>> g.num_nodes('user')
     3
-    >>> g.number_of_edges('plays')
+    >>> g.num_edges('plays')
     4
     """
     def __init__(self):
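For readers tracking the API change: `number_of_nodes`/`number_of_edges` are the older DGL names that `num_nodes`/`num_edges` replace throughout this commit. A minimal sketch of the docstring's scenario, on a hypothetical three-user/two-game toy graph:

```python
import dgl
import torch

# Toy heterograph matching the docstring's counts (hypothetical data).
g = dgl.heterograph({
    ('user', 'plays', 'game'): (torch.tensor([0, 1, 2, 2]),
                                torch.tensor([0, 0, 1, 0]))
})
print(g.num_nodes('user'))   # 3
print(g.num_edges('plays'))  # 4
```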
@@ -44,8 +44,8 @@ def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
     return train_g
 
 def build_val_test_matrix(g, val_indices, test_indices, utype, itype, etype):
-    n_users = g.number_of_nodes(utype)
-    n_items = g.number_of_nodes(itype)
+    n_users = g.num_nodes(utype)
+    n_items = g.num_nodes(itype)
     val_src, val_dst = g.find_edges(val_indices, etype=etype)
     test_src, test_dst = g.find_edges(test_indices, etype=etype)
     val_src = val_src.numpy()
@@ -26,7 +26,7 @@ class LatestNNRecommender(object):
         Return a (n_user, K) matrix of recommended items for each user
         """
         graph_slice = full_graph.edge_type_subgraph([self.user_to_item_etype])
-        n_users = full_graph.number_of_nodes(self.user_ntype)
+        n_users = full_graph.num_nodes(self.user_ntype)
         latest_interactions = dgl.sampling.select_topk(graph_slice, 1, self.timestamp, edge_dir='out')
         user, latest_items = latest_interactions.all_edges(form='uv', order='srcdst')
         # each user should have at least one "latest" interaction
@@ -36,44 +36,19 @@ def _init_input_modules(g, ntype, textset, hidden_dims):
         module_dict[column] = m
 
     if textset is not None:
-        for column, field in textset.fields.items():
-            if field.vocab.vectors:
-                module_dict[column] = BagOfWordsPretrained(field, hidden_dims)
-            else:
-                module_dict[column] = BagOfWords(field, hidden_dims)
+        for column, field in textset.items():
+            textlist, vocab, pad_var, batch_first = field
+            module_dict[column] = BagOfWords(vocab, hidden_dims)
 
     return module_dict
 
-class BagOfWordsPretrained(nn.Module):
-    def __init__(self, field, hidden_dims):
-        super().__init__()
-
-        input_dims = field.vocab.vectors.shape[1]
-        self.emb = nn.Embedding(
-            len(field.vocab.itos), input_dims,
-            padding_idx=field.vocab.stoi[field.pad_token])
-        self.emb.weight[:] = field.vocab.vectors
-        self.proj = nn.Linear(input_dims, hidden_dims)
-        nn.init.xavier_uniform_(self.proj.weight)
-        nn.init.constant_(self.proj.bias, 0)
-
-        disable_grad(self.emb)
-
-    def forward(self, x, length):
-        """
-        x: (batch_size, max_length) LongTensor
-        length: (batch_size,) LongTensor
-        """
-        x = self.emb(x).sum(1) / length.unsqueeze(1).float()
-        return self.proj(x)
-
 class BagOfWords(nn.Module):
-    def __init__(self, field, hidden_dims):
+    def __init__(self, vocab, hidden_dims):
         super().__init__()
 
         self.emb = nn.Embedding(
-            len(field.vocab.itos), hidden_dims,
-            padding_idx=field.vocab.stoi[field.pad_token])
+            len(vocab.get_itos()), hidden_dims,
+            padding_idx=vocab.get_stoi()['<pad>'])
         nn.init.xavier_uniform_(self.emb.weight)
 
     def forward(self, x, length):
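A short sketch of how the rewritten `BagOfWords` is constructed from a torchtext `Vocab` rather than a legacy `Field`. The vocabulary and batch below are toy values, and the `x`/`length` shapes follow the docstring of the removed class: `(batch, max_len)` padded token ids and `(batch,)` true lengths.

```python
from torchtext.vocab import build_vocab_from_iterator
import torch

# Toy vocabulary with the same specials the example registers.
vocab = build_vocab_from_iterator([['great', 'soundtrack']],
                                  specials=['<unk>', '<pad>'])
bow = BagOfWords(vocab, hidden_dims=16)

# One padded token-id sequence plus its true (unpadded) length.
x = torch.tensor([[vocab['great'], vocab['soundtrack'], vocab['<pad>']]])
length = torch.tensor([2])
out = bow(x, length)   # shape: (1, 16)
```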
@@ -98,7 +73,7 @@ class LinearProjector(nn.Module):
                 continue
 
             module = self.inputs[feature]
-            if isinstance(module, (BagOfWords, BagOfWordsPretrained)):
+            if isinstance(module, BagOfWords):
                 # Textual feature; find the length and pass it to the textual module.
                 length = ndata[feature + '__len']
                 result = module(data, length)
@@ -162,7 +137,7 @@ class SAGENet(nn.Module):
     def forward(self, blocks, h):
         for layer, block in zip(self.convs, blocks):
-            h_dst = h[:block.num_nodes('DST/' + block.ntypes[0])]
+            h_dst = h[:block.num_nodes('DST/' + block.ntypes[0])]
             h = layer(block, (h, h_dst), block.edata['weights'])
         return h
@@ -170,7 +145,7 @@ class ItemToItemScorer(nn.Module):
     def __init__(self, full_graph, ntype):
         super().__init__()
 
-        n_nodes = full_graph.number_of_nodes(ntype)
+        n_nodes = full_graph.num_nodes(ntype)
         self.bias = nn.Parameter(torch.zeros(n_nodes, 1))
 
     def _add_bias(self, edges):
@@ -12,6 +12,8 @@ import tqdm
 import layers
 import sampler as sampler_module
 import evaluation
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
 
 class PinSAGEModel(nn.Module):
     def __init__(self, full_graph, ntype, textsets, hidden_dims, n_layers):
@@ -46,23 +48,23 @@ def train(dataset, args):
     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
-    g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
-    g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))
-
-    # Prepare torchtext dataset and vocabulary
-    fields = {}
-    examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.legacy.data.Field(include_lengths=True, lower=True, batch_first=True)
-    for i in range(g.number_of_nodes(item_ntype)):
-        example = torchtext.legacy.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
-        examples.append(example)
-    textset = torchtext.legacy.data.Dataset(examples, fields)
-    for key, field in fields.items():
-        field.build_vocab(getattr(textset, key))
-        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')
+    g.nodes[user_ntype].data['id'] = torch.arange(g.num_nodes(user_ntype))
+    g.nodes[item_ntype].data['id'] = torch.arange(g.num_nodes(item_ntype))
+
+    # Prepare torchtext dataset and vocabulary
+    textset = {}
+    tokenizer = get_tokenizer(None)
+
+    textlist = []
+    batch_first = True
+
+    for i in range(g.num_nodes(item_ntype)):
+        for key in item_texts.keys():
+            l = tokenizer(item_texts[key][i].lower())
+            textlist.append(l)
+    for key, field in item_texts.items():
+        vocab2 = build_vocab_from_iterator(textlist, specials=["<unk>", "<pad>"])
+        textset[key] = (textlist, vocab2, vocab2.get_stoi()['<pad>'], batch_first)
 
     # Sampler
     batch_sampler = sampler_module.ItemToItemBatchSampler(
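The replacement pipeline above amounts to: tokenize every item text, build one `Vocab` over all token lists, and store a `(textlist, vocab, pad_token_id, batch_first)` tuple per text field. A condensed sketch with made-up item titles:

```python
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

titles = ['toy story (1995)', 'jumanji (1995)']   # hypothetical item texts
tokenizer = get_tokenizer(None)                   # None -> plain whitespace split
textlist = [tokenizer(t.lower()) for t in titles]
vocab = build_vocab_from_iterator(textlist, specials=['<unk>', '<pad>'])
textset = {'title': (textlist, vocab, vocab.get_stoi()['<pad>'], True)}
```

Note that the loop in the diff rebuilds an identical vocabulary once per text field; since `textlist` already pools tokens from all fields, building it a single time would be equivalent.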
@@ -77,7 +79,7 @@ def train(dataset, args):
         collate_fn=collator.collate_train,
         num_workers=args.num_workers)
     dataloader_test = DataLoader(
-        torch.arange(g.number_of_nodes(item_ntype)),
+        torch.arange(g.num_nodes(item_ntype)),
         batch_size=args.batch_size,
         collate_fn=collator.collate_test,
         num_workers=args.num_workers)
@@ -107,7 +109,7 @@ def train(dataset, args):
         # Evaluate
         model.eval()
         with torch.no_grad():
-            item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(args.batch_size)
+            item_batches = torch.arange(g.num_nodes(item_ntype)).split(args.batch_size)
             h_item_batches = []
             for blocks in dataloader_test:
                 for i in range(len(blocks)):
@@ -12,6 +12,8 @@ import tqdm
 import layers
 import sampler as sampler_module
 import evaluation
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
 
 class PinSAGEModel(nn.Module):
     def __init__(self, full_graph, ntype, textsets, hidden_dims, n_layers):
@@ -51,19 +53,19 @@ def train(dataset, args):
     device = torch.device(args.device)
 
-    # Prepare torchtext dataset and vocabulary
-    fields = {}
-    examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
-    for i in range(g.number_of_nodes(item_ntype)):
-        example = torchtext.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
-        examples.append(example)
-    textset = torchtext.data.Dataset(examples, fields)
-    for key, field in fields.items():
-        field.build_vocab(getattr(textset, key))
-        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')
+    textset = {}
+    tokenizer = get_tokenizer(None)
+
+    textlist = []
+    batch_first = True
+
+    for i in range(g.num_nodes(item_ntype)):
+        for key in item_texts.keys():
+            l = tokenizer(item_texts[key][i].lower())
+            textlist.append(l)
+    for key, field in item_texts.items():
+        vocab2 = build_vocab_from_iterator(textlist, specials=["<unk>", "<pad>"])
+        textset[key] = (textlist, vocab2, vocab2.get_stoi()['<pad>'], batch_first)
 
     # Sampler
     batch_sampler = sampler_module.ItemToItemBatchSampler(
@@ -78,7 +80,7 @@ def train(dataset, args):
         collate_fn=collator.collate_train,
         num_workers=args.num_workers)
     dataloader_test = DataLoader(
-        torch.arange(g.number_of_nodes(item_ntype)),
+        torch.arange(g.num_nodes(item_ntype)),
         batch_size=args.batch_size,
         collate_fn=collator.collate_test,
         num_workers=args.num_workers)
@@ -86,7 +88,7 @@ def train(dataset, args):
     # Model
     model = PinSAGEModel(g, item_ntype, textset, args.hidden_dims, args.num_layers).to(device)
-    item_emb = nn.Embedding(g.number_of_nodes(item_ntype), args.hidden_dims, sparse=True)
+    item_emb = nn.Embedding(g.num_nodes(item_ntype), args.hidden_dims, sparse=True)
 
     # Optimizer
     opt = torch.optim.Adam(model.parameters(), lr=args.lr)
     opt_emb = torch.optim.SparseAdam(item_emb.parameters(), lr=args.lr)
@@ -112,7 +114,7 @@ def train(dataset, args):
         # Evaluate
         model.eval()
         with torch.no_grad():
-            item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(args.batch_size)
+            item_batches = torch.arange(g.num_nodes(item_ntype)).split(args.batch_size)
             h_item_batches = []
             for blocks in tqdm.tqdm(dataloader_test):
                 for i in range(len(blocks)):
@@ -54,7 +54,7 @@ if __name__ == '__main__':
     g.edges['listened'].data['created_at'] = torch.LongTensor(events['created_at'].values)
     g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)
 
-    n_edges = g.number_of_edges('listened')
+    n_edges = g.num_edges('listened')
     train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'user_id')
    train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
     assert train_g.out_degrees(etype='listened').min() > 0
@@ -2,6 +2,20 @@ import numpy as np
 import dgl
 import torch
 from torch.utils.data import IterableDataset, DataLoader
+from torchtext.data.functional import numericalize_tokens_from_iterator
+
+def padding(array, yy, val):
+    """
+    :param array: torch tensor array
+    :param yy: desired width
+    :param val: padded value
+    :return: padded array
+    """
+    w = array.shape[0]
+    b = 0
+    bb = yy - b - w
+    return torch.nn.functional.pad(array, pad=(b, bb), mode='constant', value=val)
+
 def compact_and_copy(frontier, seeds):
     block = dgl.to_block(frontier, seeds)
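The new `padding` helper right-pads a 1-D tensor to width `yy` with value `val` via `torch.nn.functional.pad`. A usage sketch:

```python
import torch

x = torch.tensor([5, 3, 9])
padding(x, 6, 0)   # -> tensor([5, 3, 9, 0, 0, 0])
```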
@@ -22,12 +36,12 @@ class ItemToItemBatchSampler(IterableDataset):
     def __iter__(self):
         while True:
-            heads = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
+            heads = torch.randint(0, self.g.num_nodes(self.item_type), (self.batch_size,))
             tails = dgl.sampling.random_walk(
                 self.g,
                 heads,
                 metapath=[self.item_to_user_etype, self.user_to_item_etype])[0][:, 2]
-            neg_tails = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
+            neg_tails = torch.randint(0, self.g.num_nodes(self.item_type), (self.batch_size,))
 
             mask = (tails != -1)
             yield heads[mask], tails[mask], neg_tails[mask]
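For context, the positive `tails` above come from a two-hop item-user-item random walk: column 2 of the returned traces is the item reached at the end of the walk, and -1 marks walks that terminated early, hence the mask. A standalone sketch on a tiny bipartite graph (hypothetical edge types, not the example's):

```python
import dgl
import torch

g = dgl.heterograph({
    ('item', 'bought-by', 'user'): (torch.tensor([0, 1]), torch.tensor([0, 0])),
    ('user', 'bought', 'item'):    (torch.tensor([0, 0]), torch.tensor([0, 1])),
})
heads = torch.tensor([0, 1])
traces, _ = dgl.sampling.random_walk(g, heads, metapath=['bought-by', 'bought'])
tails = traces[:, 2]      # item reached via item -> user -> item
mask = (tails != -1)      # keep only walks that completed
```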
@@ -68,10 +82,10 @@ class NeighborSampler(object):
         # connections only.
         pos_graph = dgl.graph(
             (heads, tails),
-            num_nodes=self.g.number_of_nodes(self.item_type))
+            num_nodes=self.g.num_nodes(self.item_type))
         neg_graph = dgl.graph(
             (heads, neg_tails),
-            num_nodes=self.g.number_of_nodes(self.item_type))
+            num_nodes=self.g.num_nodes(self.item_type))
         pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
         seeds = pos_graph.ndata[dgl.NID]
@@ -111,12 +125,26 @@ def assign_textual_node_features(ndata, textset, ntype):
     """
     node_ids = ndata[dgl.NID].numpy()
 
-    for field_name, field in textset.fields.items():
-        examples = [getattr(textset[i], field_name) for i in node_ids]
-
-        tokens, lengths = field.process(examples)
-
-        if not field.batch_first:
+    for field_name, field in textset.items():
+        textlist, vocab, pad_var, batch_first = field
+        examples = [textlist[i] for i in node_ids]
+        ids_iter = numericalize_tokens_from_iterator(vocab, examples)
+        maxsize = max([len(textlist[i]) for i in node_ids])
+        ids = next(ids_iter)
+        x = torch.asarray([num for num in ids])
+        lengths = torch.tensor([len(x)])
+        tokens = padding(x, maxsize, pad_var)
+        for ids in ids_iter:
+            x = torch.asarray([num for num in ids])
+            l = torch.tensor([len(x)])
+            y = padding(x, maxsize, pad_var)
+            tokens = torch.vstack((tokens, y))
+            lengths = torch.cat((lengths, l))
+
+        if not batch_first:
             tokens = tokens.t()
         ndata[field_name] = tokens
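A compact sketch of what the rewritten loop produces: `numericalize_tokens_from_iterator` yields one iterator of token ids per example; each is materialized, its length recorded, and right-padded to `maxsize`, giving a `(batch, max_len)` token matrix plus a length vector. Toy inputs below; it reuses the `padding` helper defined earlier in this file.

```python
from torchtext.data.functional import numericalize_tokens_from_iterator
from torchtext.vocab import build_vocab_from_iterator
import torch

examples = [['hello', 'world'], ['hi']]           # hypothetical token lists
vocab = build_vocab_from_iterator(examples, specials=['<unk>', '<pad>'])
pad_id = vocab.get_stoi()['<pad>']
maxsize = max(len(ex) for ex in examples)

rows, lens = [], []
for ids in numericalize_tokens_from_iterator(vocab, examples):
    x = torch.tensor(list(ids))
    lens.append(len(x))
    rows.append(padding(x, maxsize, pad_id))
tokens = torch.stack(rows)        # shape (2, 2), second row padded
lengths = torch.tensor(lens)      # tensor([2, 1])
```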