Unverified Commit 598d746e authored by Chang Liu, committed by GitHub

[Example][Bugfix] Remove all torchtext legacy-related APIs for pytorch/pinsage example (#4130)



* Remove all torchtext legacy-related APIs

* Remove unused BagOfWordsPretrained class, and fix some typos
Co-authored-by: Mufei Li <mufeili1996@gmail.com>
parent 077e002f
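For context, the diff below replaces the legacy torchtext Field/Example/Dataset pipeline with a plain tokenizer plus build_vocab_from_iterator. A minimal sketch of that non-legacy pattern, using made-up movie titles rather than the example's real item_texts:

    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    tokenizer = get_tokenizer(None)                 # whitespace split, as in the diff
    texts = ["Toy Story (1995)", "Jumanji (1995)"]  # stand-in for item_texts[key]

    tokens = [tokenizer(t.lower()) for t in texts]
    vocab = build_vocab_from_iterator(tokens, specials=["<unk>", "<pad>"])

    pad_id = vocab.get_stoi()["<pad>"]              # replaces field.vocab.stoi[field.pad_token]
    print(len(vocab.get_itos()), pad_id)            # vocabulary size and padding index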
@@ -55,9 +55,9 @@ class PandasGraphBuilder(object):
     >>> builder.add_binary_relations(plays, 'user_id', 'game_id', 'plays')
     >>> builder.add_binary_relations(plays, 'game_id', 'user_id', 'played-by')
     >>> g = builder.build()
-    >>> g.number_of_nodes('user')
+    >>> g.num_nodes('user')
     3
-    >>> g.number_of_edges('plays')
+    >>> g.num_edges('plays')
     4
     """
     def __init__(self):
...
@@ -44,8 +44,8 @@ def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
     return train_g

 def build_val_test_matrix(g, val_indices, test_indices, utype, itype, etype):
-    n_users = g.number_of_nodes(utype)
-    n_items = g.number_of_nodes(itype)
+    n_users = g.num_nodes(utype)
+    n_items = g.num_nodes(itype)
     val_src, val_dst = g.find_edges(val_indices, etype=etype)
     test_src, test_dst = g.find_edges(test_indices, etype=etype)
     val_src = val_src.numpy()
...
@@ -26,7 +26,7 @@ class LatestNNRecommender(object):
         Return a (n_user, K) matrix of recommended items for each user
         """
         graph_slice = full_graph.edge_type_subgraph([self.user_to_item_etype])
-        n_users = full_graph.number_of_nodes(self.user_ntype)
+        n_users = full_graph.num_nodes(self.user_ntype)
         latest_interactions = dgl.sampling.select_topk(graph_slice, 1, self.timestamp, edge_dir='out')
         user, latest_items = latest_interactions.all_edges(form='uv', order='srcdst')
         # each user should have at least one "latest" interaction
...
@@ -36,44 +36,19 @@ def _init_input_modules(g, ntype, textset, hidden_dims):
             module_dict[column] = m

     if textset is not None:
-        for column, field in textset.fields.items():
-            if field.vocab.vectors:
-                module_dict[column] = BagOfWordsPretrained(field, hidden_dims)
-            else:
-                module_dict[column] = BagOfWords(field, hidden_dims)
+        for column, field in textset.items():
+            textlist, vocab, pad_var, batch_first = field
+            module_dict[column] = BagOfWords(vocab, hidden_dims)

     return module_dict

-class BagOfWordsPretrained(nn.Module):
-    def __init__(self, field, hidden_dims):
-        super().__init__()
-
-        input_dims = field.vocab.vectors.shape[1]
-        self.emb = nn.Embedding(
-            len(field.vocab.itos), input_dims,
-            padding_idx=field.vocab.stoi[field.pad_token])
-        self.emb.weight[:] = field.vocab.vectors
-        self.proj = nn.Linear(input_dims, hidden_dims)
-        nn.init.xavier_uniform_(self.proj.weight)
-        nn.init.constant_(self.proj.bias, 0)
-
-        disable_grad(self.emb)
-
-    def forward(self, x, length):
-        """
-        x: (batch_size, max_length) LongTensor
-        length: (batch_size,) LongTensor
-        """
-        x = self.emb(x).sum(1) / length.unsqueeze(1).float()
-        return self.proj(x)
-
 class BagOfWords(nn.Module):
-    def __init__(self, field, hidden_dims):
+    def __init__(self, vocab, hidden_dims):
         super().__init__()

         self.emb = nn.Embedding(
-            len(field.vocab.itos), hidden_dims,
-            padding_idx=field.vocab.stoi[field.pad_token])
+            len(vocab.get_itos()), hidden_dims,
+            padding_idx=vocab.get_stoi()['<pad>'])
         nn.init.xavier_uniform_(self.emb.weight)

     def forward(self, x, length):
@@ -98,7 +73,7 @@ class LinearProjector(nn.Module):
                 continue

             module = self.inputs[feature]
-            if isinstance(module, (BagOfWords, BagOfWordsPretrained)):
+            if isinstance(module, BagOfWords):
                 # Textual feature; find the length and pass it to the textual module.
                 length = ndata[feature + '__len']
                 result = module(data, length)
@@ -162,7 +137,7 @@ class SAGENet(nn.Module):
     def forward(self, blocks, h):
         for layer, block in zip(self.convs, blocks):
-            h_dst = h[:block.number_of_nodes('DST/' + block.ntypes[0])]
+            h_dst = h[:block.num_nodes('DST/' + block.ntypes[0])]
             h = layer(block, (h, h_dst), block.edata['weights'])
         return h
@@ -170,7 +145,7 @@ class ItemToItemScorer(nn.Module):
     def __init__(self, full_graph, ntype):
         super().__init__()

-        n_nodes = full_graph.number_of_nodes(ntype)
+        n_nodes = full_graph.num_nodes(ntype)
         self.bias = nn.Parameter(torch.zeros(n_nodes, 1))

     def _add_bias(self, edges):
...
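A rough sketch of how the updated BagOfWords module consumes a non-legacy torchtext Vocab; the toy vocabulary, embedding size, and id matrix below are invented for illustration:

    import torch
    import torch.nn as nn
    from torchtext.vocab import build_vocab_from_iterator

    vocab = build_vocab_from_iterator([["toy", "story"], ["jumanji"]],
                                      specials=["<unk>", "<pad>"])
    emb = nn.Embedding(len(vocab.get_itos()), 16,
                       padding_idx=vocab.get_stoi()["<pad>"])  # <pad> row stays zero

    x = torch.tensor([[2, 3, 1], [4, 1, 1]])   # (batch, max_len) ids; 1 == <pad> here
    length = torch.tensor([2, 1])              # true token counts per row
    h = emb(x).sum(1) / length.unsqueeze(1).float()  # mean over real tokens, as in forward()
    print(h.shape)                             # torch.Size([2, 16])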
@@ -12,6 +12,8 @@ import tqdm
 import layers
 import sampler as sampler_module
 import evaluation
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator

 class PinSAGEModel(nn.Module):
     def __init__(self, full_graph, ntype, textsets, hidden_dims, n_layers):
@@ -46,23 +48,23 @@ def train(dataset, args):
     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
-    g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
-    g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))
+    g.nodes[user_ntype].data['id'] = torch.arange(g.num_nodes(user_ntype))
+    g.nodes[item_ntype].data['id'] = torch.arange(g.num_nodes(item_ntype))

-    # Prepare torchtext dataset and vocabulary
-    fields = {}
-    examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.legacy.data.Field(include_lengths=True, lower=True, batch_first=True)
-    for i in range(g.number_of_nodes(item_ntype)):
-        example = torchtext.legacy.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
-        examples.append(example)
-    textset = torchtext.legacy.data.Dataset(examples, fields)
-    for key, field in fields.items():
-        field.build_vocab(getattr(textset, key))
-        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')
+    # Prepare torchtext dataset and Vocabulary
+    textset = {}
+    tokenizer = get_tokenizer(None)
+
+    textlist = []
+    batch_first = True
+
+    for i in range(g.num_nodes(item_ntype)):
+        for key in item_texts.keys():
+            l = tokenizer(item_texts[key][i].lower())
+            textlist.append(l)
+    for key, field in item_texts.items():
+        vocab2 = build_vocab_from_iterator(textlist, specials=["<unk>","<pad>"])
+        textset[key] = (textlist, vocab2, vocab2.get_stoi()['<pad>'], batch_first)

     # Sampler
     batch_sampler = sampler_module.ItemToItemBatchSampler(
@@ -77,7 +79,7 @@ def train(dataset, args):
         collate_fn=collator.collate_train,
         num_workers=args.num_workers)
     dataloader_test = DataLoader(
-        torch.arange(g.number_of_nodes(item_ntype)),
+        torch.arange(g.num_nodes(item_ntype)),
         batch_size=args.batch_size,
         collate_fn=collator.collate_test,
         num_workers=args.num_workers)
@@ -107,7 +109,7 @@ def train(dataset, args):
     # Evaluate
     model.eval()
     with torch.no_grad():
-        item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(args.batch_size)
+        item_batches = torch.arange(g.num_nodes(item_ntype)).split(args.batch_size)
         h_item_batches = []
         for blocks in dataloader_test:
             for i in range(len(blocks)):
...
@@ -12,6 +12,8 @@ import tqdm
 import layers
 import sampler as sampler_module
 import evaluation
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator

 class PinSAGEModel(nn.Module):
     def __init__(self, full_graph, ntype, textsets, hidden_dims, n_layers):
@@ -51,19 +53,19 @@ def train(dataset, args):
     device = torch.device(args.device)

     # Prepare torchtext dataset and vocabulary
-    fields = {}
-    examples = []
-    for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
-    for i in range(g.number_of_nodes(item_ntype)):
-        example = torchtext.data.Example.fromlist(
-            [item_texts[key][i] for key in item_texts.keys()],
-            [(key, fields[key]) for key in item_texts.keys()])
-        examples.append(example)
-    textset = torchtext.data.Dataset(examples, fields)
-    for key, field in fields.items():
-        field.build_vocab(getattr(textset, key))
-        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')
+    textset = {}
+    tokenizer = get_tokenizer(None)
+
+    textlist = []
+    batch_first = True
+
+    for i in range(g.num_nodes(item_ntype)):
+        for key in item_texts.keys():
+            l = tokenizer(item_texts[key][i].lower())
+            textlist.append(l)
+    for key, field in item_texts.items():
+        vocab2 = build_vocab_from_iterator(textlist, specials=["<unk>","<pad>"])
+        textset[key] = (textlist, vocab2, vocab2.get_stoi()['<pad>'], batch_first)

     # Sampler
     batch_sampler = sampler_module.ItemToItemBatchSampler(
@@ -78,7 +80,7 @@ def train(dataset, args):
         collate_fn=collator.collate_train,
         num_workers=args.num_workers)
     dataloader_test = DataLoader(
-        torch.arange(g.number_of_nodes(item_ntype)),
+        torch.arange(g.num_nodes(item_ntype)),
         batch_size=args.batch_size,
         collate_fn=collator.collate_test,
         num_workers=args.num_workers)
@@ -86,7 +88,7 @@ def train(dataset, args):
     # Model
     model = PinSAGEModel(g, item_ntype, textset, args.hidden_dims, args.num_layers).to(device)
-    item_emb = nn.Embedding(g.number_of_nodes(item_ntype), args.hidden_dims, sparse=True)
+    item_emb = nn.Embedding(g.num_nodes(item_ntype), args.hidden_dims, sparse=True)

     # Optimizer
     opt = torch.optim.Adam(model.parameters(), lr=args.lr)
     opt_emb = torch.optim.SparseAdam(item_emb.parameters(), lr=args.lr)
@@ -112,7 +114,7 @@ def train(dataset, args):
     # Evaluate
     model.eval()
     with torch.no_grad():
-        item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(args.batch_size)
+        item_batches = torch.arange(g.num_nodes(item_ntype)).split(args.batch_size)
         h_item_batches = []
         for blocks in tqdm.tqdm(dataloader_test):
             for i in range(len(blocks)):
...
@@ -54,7 +54,7 @@ if __name__ == '__main__':
     g.edges['listened'].data['created_at'] = torch.LongTensor(events['created_at'].values)
     g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)

-    n_edges = g.number_of_edges('listened')
+    n_edges = g.num_edges('listened')
     train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'user_id')
     train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
     assert train_g.out_degrees(etype='listened').min() > 0
...
@@ -2,6 +2,20 @@ import numpy as np
 import dgl
 import torch
 from torch.utils.data import IterableDataset, DataLoader
+from torchtext.data.functional import numericalize_tokens_from_iterator
+
+def padding(array, yy, val):
+    """
+    :param array: torch tensor array
+    :param yy: desired width
+    :param val: padded value
+    :return: padded array
+    """
+    w = array.shape[0]
+    b = 0
+    bb = yy - b - w
+    return torch.nn.functional.pad(array, pad=(b, bb), mode='constant', value=val)

 def compact_and_copy(frontier, seeds):
     block = dgl.to_block(frontier, seeds)
@@ -22,12 +36,12 @@ class ItemToItemBatchSampler(IterableDataset):
     def __iter__(self):
         while True:
-            heads = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
+            heads = torch.randint(0, self.g.num_nodes(self.item_type), (self.batch_size,))
             tails = dgl.sampling.random_walk(
                 self.g,
                 heads,
                 metapath=[self.item_to_user_etype, self.user_to_item_etype])[0][:, 2]
-            neg_tails = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
+            neg_tails = torch.randint(0, self.g.num_nodes(self.item_type), (self.batch_size,))

             mask = (tails != -1)
             yield heads[mask], tails[mask], neg_tails[mask]
@@ -68,10 +82,10 @@ class NeighborSampler(object):
         # connections only.
         pos_graph = dgl.graph(
             (heads, tails),
-            num_nodes=self.g.number_of_nodes(self.item_type))
+            num_nodes=self.g.num_nodes(self.item_type))
         neg_graph = dgl.graph(
             (heads, neg_tails),
-            num_nodes=self.g.number_of_nodes(self.item_type))
+            num_nodes=self.g.num_nodes(self.item_type))
         pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])

         seeds = pos_graph.ndata[dgl.NID]
@@ -111,12 +125,26 @@ def assign_textual_node_features(ndata, textset, ntype):
     """
     node_ids = ndata[dgl.NID].numpy()

-    for field_name, field in textset.fields.items():
-        examples = [getattr(textset[i], field_name) for i in node_ids]
-        tokens, lengths = field.process(examples)
-
-        if not field.batch_first:
+    for field_name, field in textset.items():
+        textlist, vocab, pad_var, batch_first = field
+
+        examples = [textlist[i] for i in node_ids]
+        ids_iter = numericalize_tokens_from_iterator(vocab, examples)
+
+        maxsize = max([len(textlist[i]) for i in node_ids])
+        ids = next(ids_iter)
+        x = torch.asarray([num for num in ids])
+        lengths = torch.tensor([len(x)])
+        tokens = padding(x, maxsize, pad_var)
+
+        for ids in ids_iter:
+            x = torch.asarray([num for num in ids])
+            l = torch.tensor([len(x)])
+            y = padding(x, maxsize, pad_var)
+            tokens = torch.vstack((tokens,y))
+            lengths = torch.cat((lengths, l))
+
+        if not batch_first:
             tokens = tokens.t()

         ndata[field_name] = tokens
...
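To make the new numericalize-and-pad path above concrete, here is a small self-contained sketch of numericalize_tokens_from_iterator feeding a padding helper; the vocabulary and token lists are invented, and the helper is re-declared locally so the snippet runs on its own:

    import torch
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.data.functional import numericalize_tokens_from_iterator

    def padding(array, yy, val):
        # right-pad a 1-D tensor to width yy with val (mirrors the helper added above)
        return torch.nn.functional.pad(array, pad=(0, yy - array.shape[0]),
                                       mode='constant', value=val)

    examples = [["toy", "story"], ["jumanji"]]
    vocab = build_vocab_from_iterator(examples, specials=["<unk>", "<pad>"])
    pad_id = vocab.get_stoi()["<pad>"]

    rows, lengths = [], []
    for ids in numericalize_tokens_from_iterator(vocab, examples):
        x = torch.tensor(list(ids))
        lengths.append(len(x))
        rows.append(padding(x, 2, pad_id))   # 2 == longest example here

    tokens = torch.vstack(rows)
    print(tokens)    # padded (2, 2) id matrix
    print(lengths)   # per-row true lengths, e.g. [2, 1]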