Unverified commit 9c41c22d authored by Tong He, committed by GitHub

[Model] Official implementation for HiLANDER model. (#3087)



* add hilander model implementation draft

* use focal loss

* fix

* change data root

* add necessary scripts

* update download links

* update

* update example table

* fix

* update readme with numbers

* add empty folder

* only eval at the end

* set up hilander

* inform results may fluctuate

* address comments
Co-authored-by: sneakerkg <xiaotj1990327@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-19-212.us-east-2.compute.internal>
parent 66ad774f
import argparse, time, os, pickle
import numpy as np
import dgl
import torch
import torch.optim as optim
from models import LANDER
from dataset import LanderDataset
###########
# ArgParser
parser = argparse.ArgumentParser()
# Dataset
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--test_data_path', type=str, required=True)
parser.add_argument('--levels', type=str, default='1')
parser.add_argument('--faiss_gpu', action='store_true')
parser.add_argument('--model_filename', type=str, default='lander.pth')
# KNN
parser.add_argument('--knn_k', type=str, default='10')
# Model
parser.add_argument('--hidden', type=int, default=512)
parser.add_argument('--num_conv', type=int, default=4)
parser.add_argument('--dropout', type=float, default=0.)
parser.add_argument('--gat', action='store_true')
parser.add_argument('--gat_k', type=int, default=1)
parser.add_argument('--balance', action='store_true')
parser.add_argument('--use_cluster_feat', action='store_true')
parser.add_argument('--use_focal_loss', action='store_true')
# Training
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--weight_decay', type=float, default=1e-5)
args = parser.parse_args()
###########################
# Environment Configuration
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
##################
# Data Preparation
def prepare_dataset_graphs(data_path, k_list, lvl_list):
    with open(data_path, 'rb') as f:
        features, labels = pickle.load(f)
    gs = []
    for k, l in zip(k_list, lvl_list):
        dataset = LanderDataset(features=features, labels=labels, k=k,
                                levels=l, faiss_gpu=args.faiss_gpu)
        gs += [g.to(device) for g in dataset.gs]
    return gs
k_list = [int(k) for k in args.knn_k.split(',')]
lvl_list = [int(l) for l in args.levels.split(',')]
gs = prepare_dataset_graphs(args.data_path, k_list, lvl_list)
test_gs = prepare_dataset_graphs(args.test_data_path, k_list, lvl_list)
##################
# Model Definition
feature_dim = gs[0].ndata['features'].shape[1]
model = LANDER(feature_dim=feature_dim, nhid=args.hidden,
               num_conv=args.num_conv, dropout=args.dropout,
               use_GAT=args.gat, K=args.gat_k,
               balance=args.balance,
               use_cluster_feat=args.use_cluster_feat,
               use_focal_loss=args.use_focal_loss)
model = model.to(device)
model.train()
best_model = None
best_loss = np.inf
#################
# Hyperparameters
opt = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                weight_decay=args.weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=args.epochs, eta_min=1e-5)
###############
# Training Loop
for epoch in range(args.epochs):
    all_loss_den_val = 0
    all_loss_conn_val = 0
    for g in gs:
        opt.zero_grad()
        g = model(g)
        loss, loss_den_val, loss_conn_val = model.compute_loss(g)
        all_loss_den_val += loss_den_val
        all_loss_conn_val += loss_conn_val
        loss.backward()
        opt.step()
    scheduler.step()
    print('Training, epoch: %d, loss_den: %.6f, loss_conn: %.6f' %
          (epoch, all_loss_den_val, all_loss_conn_val))
    # Report test loss
    all_test_loss_den_val = 0
    all_test_loss_conn_val = 0
    with torch.no_grad():
        for g in test_gs:
            g = model(g)
            loss, loss_den_val, loss_conn_val = model.compute_loss(g)
            all_test_loss_den_val += loss_den_val
            all_test_loss_conn_val += loss_conn_val
    print('Testing, epoch: %d, loss_den: %.6f, loss_conn: %.6f' %
          (epoch, all_test_loss_den_val, all_test_loss_conn_val))
    if all_test_loss_conn_val + all_test_loss_den_val < best_loss:
        best_loss = all_test_loss_conn_val + all_test_loss_den_val
        print('New best epoch', epoch)
        torch.save(model.state_dict(), args.model_filename + '_best')
    torch.save(model.state_dict(), args.model_filename)
torch.save(model.state_dict(), args.model_filename)
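# --- Usage sketch (not part of the commit) ----------------------------------
# The full-graph training script above expects --data_path / --test_data_path
# to point at pickles holding a (features, labels) tuple: `features` is an
# [N, D] float32 array of l2-normalized embeddings and `labels` is an [N]
# integer array of cluster ids. A minimal dummy file the script could consume
# might be produced as below; the file name and sizes are arbitrary.
import pickle
import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(1000, 256)).astype(np.float32)
features /= np.linalg.norm(features, axis=1, keepdims=True)
labels = rng.integers(0, 50, size=1000).astype(np.int64)
with open('dummy_train.pkl', 'wb') as f:
    pickle.dump((features, labels), f)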
import argparse, time, os, pickle
import numpy as np
import dgl
import torch
import torch.optim as optim
from models import LANDER
from dataset import LanderDataset
###########
# ArgParser
parser = argparse.ArgumentParser()
# Dataset
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--levels', type=str, default='1')
parser.add_argument('--faiss_gpu', action='store_true')
parser.add_argument('--model_filename', type=str, default='lander.pth')
# KNN
parser.add_argument('--knn_k', type=str, default='10')
parser.add_argument('--num_workers', type=int, default=0)
# Model
parser.add_argument('--hidden', type=int, default=512)
parser.add_argument('--num_conv', type=int, default=1)
parser.add_argument('--dropout', type=float, default=0.)
parser.add_argument('--gat', action='store_true')
parser.add_argument('--gat_k', type=int, default=1)
parser.add_argument('--balance', action='store_true')
parser.add_argument('--use_cluster_feat', action='store_true')
parser.add_argument('--use_focal_loss', action='store_true')
# Training
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=1024)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--weight_decay', type=float, default=1e-5)
args = parser.parse_args()
print(args)
###########################
# Environment Configuration
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
##################
# Data Preparation
with open(args.data_path, 'rb') as f:
    features, labels = pickle.load(f)
k_list = [int(k) for k in args.knn_k.split(',')]
lvl_list = [int(l) for l in args.levels.split(',')]
gs = []
nbrs = []
ks = []
for k, l in zip(k_list, lvl_list):
    dataset = LanderDataset(features=features, labels=labels, k=k,
                            levels=l, faiss_gpu=args.faiss_gpu)
    gs += [g for g in dataset.gs]
    ks += [k for g in dataset.gs]
    nbrs += [nbr for nbr in dataset.nbrs]
print('Dataset Prepared.')
def set_train_sampler_loader(g, k):
    fanouts = [k - 1 for i in range(args.num_conv + 1)]
    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    # fix the number of edges
    train_dataloader = dgl.dataloading.NodeDataLoader(
        g, torch.arange(g.number_of_nodes()), sampler,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=args.num_workers
    )
    return train_dataloader
train_loaders = []
for gidx, g in enumerate(gs):
    train_dataloader = set_train_sampler_loader(gs[gidx], ks[gidx])
    train_loaders.append(train_dataloader)
##################
# Model Definition
feature_dim = gs[0].ndata['features'].shape[1]
model = LANDER(feature_dim=feature_dim, nhid=args.hidden,
               num_conv=args.num_conv, dropout=args.dropout,
               use_GAT=args.gat, K=args.gat_k,
               balance=args.balance,
               use_cluster_feat=args.use_cluster_feat,
               use_focal_loss=args.use_focal_loss)
model = model.to(device)
model.train()
#################
# Hyperparameters
opt = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                weight_decay=args.weight_decay)
# keep num_batch_per_loader the same for every sub_dataloader
num_batch_per_loader = len(train_loaders[0])
train_loaders = [iter(train_loader) for train_loader in train_loaders]
num_loaders = len(train_loaders)
scheduler = optim.lr_scheduler.CosineAnnealingLR(opt,
                                                 T_max=args.epochs * num_batch_per_loader * num_loaders,
                                                 eta_min=1e-5)
print('Start Training.')
###############
# Training Loop
for epoch in range(args.epochs):
    loss_den_val_total = []
    loss_conn_val_total = []
    loss_val_total = []
    for batch in range(num_batch_per_loader):
        for loader_id in range(num_loaders):
            try:
                minibatch = next(train_loaders[loader_id])
            except StopIteration:
                # the dataloader is exhausted; re-create it and keep iterating
                train_loaders[loader_id] = iter(set_train_sampler_loader(gs[loader_id], ks[loader_id]))
                minibatch = next(train_loaders[loader_id])
            input_nodes, sub_g, bipartites = minibatch
            sub_g = sub_g.to(device)
            bipartites = [b.to(device) for b in bipartites]
            # get the feature for the input_nodes
            opt.zero_grad()
            output_bipartite = model(bipartites)
            loss, loss_den_val, loss_conn_val = model.compute_loss(output_bipartite)
            loss_den_val_total.append(loss_den_val)
            loss_conn_val_total.append(loss_conn_val)
            loss_val_total.append(loss.item())
            loss.backward()
            opt.step()
            if (batch + 1) % 10 == 0:
                print('epoch: %d, batch: %d / %d, loader_id : %d / %d, loss: %.6f, loss_den: %.6f, loss_conn: %.6f' %
                      (epoch, batch, num_batch_per_loader, loader_id, num_loaders,
                       loss.item(), loss_den_val, loss_conn_val))
            scheduler.step()
    print('epoch: %d, loss: %.6f, loss_den: %.6f, loss_conn: %.6f' %
          (epoch, np.array(loss_val_total).mean(),
           np.array(loss_den_val_total).mean(), np.array(loss_conn_val_total).mean()))
    torch.save(model.state_dict(), args.model_filename)
torch.save(model.state_dict(), args.model_filename)
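# --- Illustrative sketch (not part of the commit) ----------------------------
# How the MultiLayerNeighborSampler / NodeDataLoader pair used above yields
# minibatches: each iteration returns (input_nodes, seed_nodes, blocks), where
# `blocks` holds one bipartite graph per sampled hop; the training loop above
# forwards these blocks through LANDER instead of the full level graph. The toy
# graph and sizes below are assumptions, only meant to show the shapes involved.
import dgl
import torch

toy_g = dgl.rand_graph(100, 500)
toy_sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 5])
toy_loader = dgl.dataloading.NodeDataLoader(
    toy_g, torch.arange(toy_g.number_of_nodes()), toy_sampler,
    batch_size=32, shuffle=True, drop_last=False, num_workers=0)
input_nodes, seed_nodes, blocks = next(iter(toy_loader))
print(len(blocks), blocks[0].number_of_src_nodes(), blocks[-1].number_of_dst_nodes())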
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .misc import *
from .knn import *
from .adjacency import *
from .faiss_search import faiss_search_knn
from .faiss_gpu import faiss_search_approx_knn
from .evaluate import *
from .deduce import *
from .density import *
from .metrics import *
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import numpy as np
import scipy.sparse as sp
from scipy.sparse import coo_matrix
def row_normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    # if rowsum <= 0, keep its previous value
    rowsum[rowsum <= 0] = 1
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx, r_inv


def sparse_mx_to_indices_values(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
    values = sparse_mx.data
    shape = np.array(sparse_mx.shape)
    return indices, values, shape
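# --- Usage sketch (not part of the commit) ------------------------------------
# row_normalize scales each row of a sparse adjacency so it sums to 1 and also
# returns the per-row scaling factors; all-zero rows are left untouched. The
# tiny matrix below is purely illustrative.
if __name__ == '__main__':
    A = sp.csr_matrix(np.array([[0., 2., 2.],
                                [1., 0., 0.],
                                [0., 0., 0.]]))
    A_norm, r_inv = row_normalize(A)
    print(A_norm.toarray())  # [[0., .5, .5], [1., 0., 0.], [0., 0., 0.]]
    print(r_inv)             # [0.25, 1., 1.]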
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import numpy as np
from sklearn import mixture
import torch
import dgl
from .density import density_to_peaks_vectorize, density_to_peaks
__all__ = ['peaks_to_labels', 'edge_to_connected_graph', 'decode', 'build_next_level']
def _find_parent(parent, u):
    idx = []
    # parent is a fixed point
    while (u != parent[u]):
        idx.append(u)
        u = parent[u]
    for i in idx:
        parent[i] = u
    return u


def edge_to_connected_graph(edges, num):
    parent = list(range(num))
    for u, v in edges:
        p_u = _find_parent(parent, u)
        p_v = _find_parent(parent, v)
        parent[p_u] = p_v
    for i in range(num):
        parent[i] = _find_parent(parent, i)
    remap = {}
    uf = np.unique(np.array(parent))
    for i, f in enumerate(uf):
        remap[f] = i
    cluster_id = np.array([remap[f] for f in parent])
    return cluster_id


def peaks_to_edges(peaks, dist2peak, tau):
    edges = []
    for src in peaks:
        dsts = peaks[src]
        dists = dist2peak[src]
        for dst, dist in zip(dsts, dists):
            if src == dst or dist >= 1 - tau:
                continue
            edges.append([src, dst])
    return edges


def peaks_to_labels(peaks, dist2peak, tau, inst_num):
    edges = peaks_to_edges(peaks, dist2peak, tau)
    pred_labels = edge_to_connected_graph(edges, inst_num)
    return pred_labels, edges


def get_dists(g, nbrs, use_gt):
    k = nbrs.shape[1]
    src_id = nbrs[:, 1:].reshape(-1)
    dst_id = nbrs[:, 0].repeat(k - 1)
    eids = g.edge_ids(src_id, dst_id)
    if use_gt:
        new_dists = (1 - g.edata['labels_edge'][eids]).reshape(-1, k - 1).float()
    else:
        new_dists = g.edata['prob_conn'][eids, 0].reshape(-1, k - 1)
    ind = torch.argsort(new_dists, 1)
    offset = torch.LongTensor((nbrs[:, 0] * (k - 1)).repeat(k - 1).reshape(-1, k - 1)).to(g.device)
    ind = ind + offset
    nbrs = torch.LongTensor(nbrs).to(g.device)
    new_nbrs = torch.take(nbrs[:, 1:], ind)
    new_dists = torch.cat([torch.zeros((new_dists.shape[0], 1)).to(g.device), new_dists], dim=1)
    new_nbrs = torch.cat([torch.arange(new_nbrs.shape[0]).view(-1, 1).to(g.device), new_nbrs], dim=1)
    return new_nbrs.cpu().detach().numpy(), new_dists.cpu().detach().numpy()
def get_edge_dist(g, threshold):
    if threshold == 'prob':
        return g.edata['prob_conn'][:, 0]
    return 1 - g.edata['raw_affine']


def tree_generation(ng):
    ng.ndata['keep_eid'] = torch.zeros(ng.number_of_nodes()).long() - 1

    def message_func(edges):
        return {'mval': edges.data['edge_dist'],
                'meid': edges.data[dgl.EID]}

    def reduce_func(nodes):
        ind = torch.min(nodes.mailbox['mval'], dim=1)[1]
        keep_eid = nodes.mailbox['meid'].gather(1, ind.view(-1, 1))
        return {'keep_eid': keep_eid[:, 0]}

    node_order = dgl.traversal.topological_nodes_generator(ng)
    ng.prop_nodes(node_order, message_func, reduce_func)
    eids = ng.ndata['keep_eid']
    eids = eids[eids > -1]
    edges = ng.find_edges(eids)
    treeg = dgl.graph(edges, num_nodes=ng.number_of_nodes())
    return treeg


def peak_propogation(treeg):
    treeg.ndata['pred_labels'] = torch.zeros(treeg.number_of_nodes()).long() - 1
    peaks = torch.where(treeg.in_degrees() == 0)[0].cpu().numpy()
    treeg.ndata['pred_labels'][peaks] = torch.arange(peaks.shape[0])

    def message_func(edges):
        return {'mlb': edges.src['pred_labels']}

    def reduce_func(nodes):
        return {'pred_labels': nodes.mailbox['mlb'][:, 0]}

    node_order = dgl.traversal.topological_nodes_generator(treeg)
    treeg.prop_nodes(node_order, message_func, reduce_func)
    pred_labels = treeg.ndata['pred_labels'].cpu().numpy()
    return peaks, pred_labels
def decode(g, tau, threshold, use_gt,
           ids=None, global_edges=None, global_num_nodes=None, global_peaks=None):
    # Edge filtering with tau and density
    den_key = 'density' if use_gt else 'pred_den'
    g = g.local_var()
    g.edata['edge_dist'] = get_edge_dist(g, threshold)
    g.apply_edges(lambda edges: {'keep': (edges.src[den_key] > edges.dst[den_key]).long() * \
                                         (edges.data['edge_dist'] < 1 - tau).long()})
    eids = torch.where(g.edata['keep'] == 0)[0]
    ng = dgl.remove_edges(g, eids)
    # Tree generation
    ng.edata[dgl.EID] = torch.arange(ng.number_of_edges())
    treeg = tree_generation(ng)
    # Label propagation
    peaks, pred_labels = peak_propogation(treeg)
    if ids is None:
        return pred_labels, peaks
    # Merge with previous layers
    src, dst = treeg.edges()
    new_global_edges = (global_edges[0] + ids[src.numpy()].tolist(),
                        global_edges[1] + ids[dst.numpy()].tolist())
    global_treeg = dgl.graph(new_global_edges, num_nodes=global_num_nodes)
    global_peaks, global_pred_labels = peak_propogation(global_treeg)
    return pred_labels, peaks, new_global_edges, global_pred_labels, global_peaks
def build_next_level(features, labels, peaks,
                     global_features, global_pred_labels, global_peaks):
    global_peak_to_label = global_pred_labels[global_peaks]
    global_label_to_peak = np.zeros_like(global_peak_to_label)
    for i, pl in enumerate(global_peak_to_label):
        global_label_to_peak[pl] = i
    cluster_ind = np.split(np.argsort(global_pred_labels),
                           np.unique(np.sort(global_pred_labels), return_index=True)[1][1:])
    cluster_features = np.zeros((len(peaks), global_features.shape[1]))
    for pi in range(len(peaks)):
        cluster_features[global_label_to_peak[pi], :] = np.mean(global_features[cluster_ind[pi], :], axis=0)
    features = features[peaks]
    labels = labels[peaks]
    return features, labels, cluster_features
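# --- Usage sketch (not part of the commit) ------------------------------------
# Two of the building blocks above on toy inputs: edge_to_connected_graph runs
# union-find over peak edges so chained nodes share a cluster id, and
# peak_propogation labels every node of a decoded tree with the id of the root
# ("peak") it descends from. The toy edges/graph are assumptions; run as a
# module (python -m ...) so the relative imports at the top of this file resolve.
if __name__ == '__main__':
    print(edge_to_connected_graph([(0, 1), (1, 2), (3, 4)], num=6))
    # -> [0 0 0 1 1 2]: {0, 1, 2} merge, {3, 4} merge, node 5 stays alone

    # a 5-node forest whose edges point from parent (higher density) to child
    toy_tree = dgl.graph(([0, 1, 3], [1, 2, 4]), num_nodes=5)
    peaks, pred_labels = peak_propogation(toy_tree)
    print(peaks, pred_labels)  # roots [0 3]; labels [0 0 0 1 1]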
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import numpy as np
from tqdm import tqdm
from itertools import groupby
import torch
__all__ = ['density_estimation', 'density_to_peaks', 'density_to_peaks_vectorize']
def density_estimation(dists, nbrs, labels, **kwargs):
    '''Supervised density defined on the k-NN neighborhood of each node.'''
    num, k_knn = dists.shape
    conf = np.ones((num, ), dtype=np.float32)
    ind_array = labels[nbrs] == np.expand_dims(labels, 1).repeat(k_knn, 1)
    pos = ((1 - dists[:, 1:]) * ind_array[:, 1:]).sum(1)
    neg = ((1 - dists[:, 1:]) * (1 - ind_array[:, 1:])).sum(1)
    conf = (pos - neg) * conf
    conf /= (k_knn - 1)
    return conf
def density_to_peaks_vectorize(dists, nbrs, density, max_conn=1, name=''):
    # only computes a single (top-1) connectivity per node
    assert dists.shape[0] == density.shape[0]
    assert dists.shape == nbrs.shape
    num, k = dists.shape
    if name == 'gcn_feat':
        include_mask = nbrs != np.arange(0, num).reshape(-1, 1)
        # TODO: the condition == k should not happen, as the distance to the node
        # itself should be the smallest; kept as a guard for numerical stability.
        # TODO: support top-M neighbors instead of only top-1.
        secondary_mask = np.sum(include_mask, axis=1) == k
        include_mask[secondary_mask, -1] = False
        nbrs_exclude_self = nbrs[include_mask].reshape(-1, k - 1)    # (V, k - 1)
        dists_exclude_self = dists[include_mask].reshape(-1, k - 1)  # (V, k - 1)
    else:
        include_mask = nbrs != np.arange(0, num).reshape(-1, 1)
        nbrs_exclude_self = nbrs[include_mask].reshape(-1, k - 1)    # (V, k - 1)
        dists_exclude_self = dists[include_mask].reshape(-1, k - 1)  # (V, k - 1)
    compare_map = density[nbrs_exclude_self] > density.reshape(-1, 1)
    peak_index = np.argmax(np.where(compare_map, 1, 0), axis=1)       # (V,)
    compare_map_sum = np.sum(compare_map.cpu().data.numpy(), axis=1)  # (V,)
    dist2peak = {i: [] if compare_map_sum[i] == 0 else [dists_exclude_self[i, peak_index[i]]] for i in range(num)}
    peaks = {i: [] if compare_map_sum[i] == 0 else [nbrs_exclude_self[i, peak_index[i]]] for i in range(num)}
    return dist2peak, peaks
def density_to_peaks(dists, nbrs, density, max_conn=1, sort='dist'):
    # Note that dists have been sorted in ascending order
    assert dists.shape[0] == density.shape[0]
    assert dists.shape == nbrs.shape
    num, _ = dists.shape
    dist2peak = {i: [] for i in range(num)}
    peaks = {i: [] for i in range(num)}
    for i, nbr in tqdm(enumerate(nbrs)):
        nbr_conf = density[nbr]
        for j, c in enumerate(nbr_conf):
            nbr_idx = nbr[j]
            if i == nbr_idx or c <= density[i]:
                continue
            dist2peak[i].append(dists[i, j])
            peaks[i].append(nbr_idx)
            if len(dist2peak[i]) >= max_conn:
                break
    return dist2peak, peaks
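# --- Usage sketch (not part of the commit) ------------------------------------
# density_estimation scores each node by how much of its similarity-weighted
# neighborhood shares its label; density_to_peaks then points every node at its
# nearest higher-density neighbor. The tiny arrays below are assumptions: column
# 0 of dists/nbrs is the node itself at distance 0, as produced by the k-NN search.
if __name__ == '__main__':
    nbrs = np.array([[0, 1, 2],
                     [1, 0, 2],
                     [2, 1, 0]])
    dists = np.array([[0.0, 0.2, 0.9],
                      [0.0, 0.2, 0.8],
                      [0.0, 0.8, 0.9]])
    labels = np.array([0, 0, 1])
    density = density_estimation(dists, nbrs, labels)
    print(density)  # [0.35, 0.3, -0.15]
    dist2peak, peaks = density_to_peaks(dists, nbrs, density, max_conn=1)
    print(peaks)    # each node's higher-density neighbor; empty list for the densest node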
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import inspect
import argparse
import numpy as np
from utils import Timer, TextColors, metrics
from clustering_benchmark import ClusteringBenchmark
def _read_meta(fn):
    labels = list()
    lb_set = set()
    with open(fn) as f:
        for lb in f.readlines():
            lb = int(lb.strip())
            labels.append(lb)
            lb_set.add(lb)
    return np.array(labels), lb_set
def evaluate(gt_labels, pred_labels, metric='pairwise'):
    if isinstance(gt_labels, str) and isinstance(pred_labels, str):
        print('[gt_labels] {}'.format(gt_labels))
        print('[pred_labels] {}'.format(pred_labels))
        gt_labels, gt_lb_set = _read_meta(gt_labels)
        pred_labels, pred_lb_set = _read_meta(pred_labels)
        print('#inst: gt({}) vs pred({})'.format(len(gt_labels),
                                                 len(pred_labels)))
        print('#cls: gt({}) vs pred({})'.format(len(gt_lb_set),
                                                len(pred_lb_set)))
    metric_func = metrics.__dict__[metric]
    with Timer('evaluate with {}{}{}'.format(TextColors.FATAL, metric,
                                             TextColors.ENDC)):
        result = metric_func(gt_labels, pred_labels)
    if isinstance(result, float):
        print('{}{}: {:.4f}{}'.format(TextColors.OKGREEN, metric, result,
                                      TextColors.ENDC))
    else:
        ave_pre, ave_rec, fscore = result
        print('{}ave_pre: {:.4f}, ave_rec: {:.4f}, fscore: {:.4f}{}'.format(
            TextColors.OKGREEN, ave_pre, ave_rec, fscore, TextColors.ENDC))
def evaluation(pred_labels, labels, metrics):
    print('==> evaluation')
    # pred_labels = g.ndata['pred_labels'].cpu().numpy()
    max_cluster = np.max(pred_labels)
    # gt_labels_all = g.ndata['labels'].cpu().numpy()
    gt_labels_all = labels
    pred_labels_all = pred_labels
    metric_list = metrics.split(',')
    for metric in metric_list:
        evaluate(gt_labels_all, pred_labels_all, metric)
    # V-measure (homogeneity / completeness) and Fowlkes-Mallows scores
    gt_dict = {}
    pred_dict = {}
    for i in range(len(gt_labels_all)):
        gt_dict[str(i)] = gt_labels_all[i]
        pred_dict[str(i)] = pred_labels_all[i]
    bm = ClusteringBenchmark(gt_dict)
    scores = bm.evaluate_vmeasure(pred_dict)
    fmi_scores = bm.evaluate_fowlkes_mallows_score(pred_dict)
    print(scores)
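# --- Usage sketch (not part of the commit) ------------------------------------
# evaluate() accepts either label-file paths or in-memory label arrays; with
# arrays it simply looks up the metric by name in utils.metrics and prints the
# result. The toy labels are assumptions, and importing this module still
# requires the clustering_benchmark dependency used above.
if __name__ == '__main__':
    gt = np.array([0, 0, 1, 1, 2, 2])
    pred = np.array([0, 0, 1, 2, 2, 2])
    evaluate(gt, pred, metric='pairwise')  # prints pairwise precision / recall / F-score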
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import os
import gc
import numpy as np
from tqdm import tqdm
import faiss
__all__ = ['faiss_search_approx_knn']
class faiss_index_wrapper():
    def __init__(self,
                 target,
                 nprobe=128,
                 index_factory_str=None,
                 verbose=False,
                 mode='proxy',
                 using_gpu=True):
        self._res_list = []
        num_gpu = faiss.get_num_gpus()
        print('[faiss gpu] #GPU: {}'.format(num_gpu))
        size, dim = target.shape
        assert size > 0, "size: {}".format(size)
        index_factory_str = "IVF{},PQ{}".format(
            min(8192, 16 * round(np.sqrt(size))),
            32) if index_factory_str is None else index_factory_str
        cpu_index = faiss.index_factory(dim, index_factory_str)
        cpu_index.nprobe = nprobe
        if mode == 'proxy':
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False
            index = faiss.IndexProxy()
            for i in range(num_gpu):
                res = faiss.StandardGpuResources()
                self._res_list.append(res)
                sub_index = faiss.index_cpu_to_gpu(
                    res, i, cpu_index, co) if using_gpu else cpu_index
                index.addIndex(sub_index)
        elif mode == 'shard':
            co = faiss.GpuMultipleClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False
            co.shard = True
            index = faiss.index_cpu_to_all_gpus(cpu_index,
                                                co,
                                                ngpu=num_gpu)
        else:
            raise KeyError("Unknown index mode")
        index = faiss.IndexIDMap(index)
        index.verbose = verbose
        # get nlist to decide how many samples are used for training
        nlist = int(float([
            item for item in index_factory_str.split(",") if 'IVF' in item
        ][0].replace("IVF", "")))
        # training
        if not index.is_trained:
            indexes_sample_for_train = np.random.randint(
                0, size, nlist * 256)
            index.train(target[indexes_sample_for_train])
        # add with ids
        target_ids = np.arange(0, size)
        index.add_with_ids(target, target_ids)
        self.index = index

    def search(self, *args, **kargs):
        return self.index.search(*args, **kargs)

    def __del__(self):
        self.index.reset()
        del self.index
        for res in self._res_list:
            del res
def batch_search(index, query, k, bs, verbose=False):
    n = len(query)
    dists = np.zeros((n, k), dtype=np.float32)
    nbrs = np.zeros((n, k), dtype=np.int64)
    for sid in tqdm(range(0, n, bs),
                    desc="faiss searching...",
                    disable=not verbose):
        eid = min(n, sid + bs)
        dists[sid:eid], nbrs[sid:eid] = index.search(query[sid:eid], k)
    return dists, nbrs
def faiss_search_approx_knn(query,
                            target,
                            k,
                            nprobe=128,
                            bs=int(1e6),
                            index_factory_str=None,
                            verbose=False):
    index = faiss_index_wrapper(target,
                                nprobe=nprobe,
                                index_factory_str=index_factory_str,
                                verbose=verbose)
    dists, nbrs = batch_search(index, query, k=k, bs=bs, verbose=verbose)
    del index
    gc.collect()
    return dists, nbrs
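# --- Usage sketch (not part of the commit; needs a faiss build with at least ---
# one GPU). faiss_search_approx_knn builds an IVF,PQ index over `target`,
# spreads it across the available GPUs, and batch-queries it, returning [N, k]
# distance and neighbor-index arrays. Sizes and parameters below are assumptions.
if __name__ == '__main__':
    feats = np.random.rand(20000, 128).astype('float32')
    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
    dists, nbrs = faiss_search_approx_knn(query=feats, target=feats, k=10, nprobe=64)
    print(dists.shape, nbrs.shape)  # (20000, 10) (20000, 10)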
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import gc
from tqdm import tqdm
from .faiss_gpu import faiss_search_approx_knn
__all__ = ['faiss_search_knn']
def precise_dist(feat, nbrs, num_process=4, sort=True, verbose=False):
    import torch
    feat_share = torch.from_numpy(feat).share_memory_()
    nbrs_share = torch.from_numpy(nbrs).share_memory_()
    dist_share = torch.zeros_like(nbrs_share).float().share_memory_()
    precise_dist_share_mem(feat_share,
                           nbrs_share,
                           dist_share,
                           num_process=num_process,
                           sort=sort,
                           verbose=verbose)
    del feat_share
    gc.collect()
    return dist_share.numpy(), nbrs_share.numpy()


def precise_dist_share_mem(feat,
                           nbrs,
                           dist,
                           num_process=16,
                           sort=True,
                           process_unit=4000,
                           verbose=False):
    from torch import multiprocessing as mp
    num, _ = feat.shape
    num_per_proc = int(num / num_process) + 1
    for pi in range(num_process):
        sid = pi * num_per_proc
        eid = min(sid + num_per_proc, num)
        kwargs = {'feat': feat,
                  'nbrs': nbrs,
                  'dist': dist,
                  'sid': sid,
                  'eid': eid,
                  'sort': sort,
                  'process_unit': process_unit,
                  'verbose': verbose,
                  }
        bmm(**kwargs)
def bmm(feat,
        nbrs,
        dist,
        sid,
        eid,
        sort=True,
        process_unit=4000,
        verbose=False):
    import torch
    _, cols = dist.shape
    batch_sim = torch.zeros((eid - sid, cols), dtype=torch.float32)
    for s in tqdm(range(sid, eid, process_unit),
                  desc='bmm',
                  disable=not verbose):
        e = min(eid, s + process_unit)
        query = feat[s:e].unsqueeze(1)
        gallery = feat[nbrs[s:e]].permute(0, 2, 1)
        batch_sim[s - sid:e - sid] = torch.clamp(torch.bmm(query, gallery).view(-1, cols), 0.0, 1.0)
    if sort:
        sort_unit = int(1e6)
        batch_nbr = nbrs[sid:eid]
        for s in range(0, batch_sim.shape[0], sort_unit):
            e = min(s + sort_unit, eid)
            batch_sim[s:e], indices = torch.sort(batch_sim[s:e],
                                                 descending=True)
            batch_nbr[s:e] = torch.gather(batch_nbr[s:e], 1, indices)
        nbrs[sid:eid] = batch_nbr
    dist[sid:eid] = 1. - batch_sim
def faiss_search_knn(feat,
                     k,
                     nprobe=128,
                     num_process=4,
                     is_precise=True,
                     sort=True,
                     verbose=False):
    dists, nbrs = faiss_search_approx_knn(query=feat,
                                          target=feat,
                                          k=k,
                                          nprobe=nprobe,
                                          verbose=verbose)
    if is_precise:
        print('compute precise dist among k={} nearest neighbors'.format(k))
        dists, nbrs = precise_dist(feat,
                                   nbrs,
                                   num_process=num_process,
                                   sort=sort,
                                   verbose=verbose)
    return dists, nbrs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import os
import math
import numpy as np
import multiprocessing as mp
from tqdm import tqdm
from utils import Timer
from .faiss_search import faiss_search_knn
__all__ = [
    'knn_faiss', 'knn_faiss_gpu',
    'fast_knns2spmat', 'build_knns',
    'knns2ordered_nbrs'
]
def knns2ordered_nbrs(knns, sort=True):
    if isinstance(knns, list):
        knns = np.array(knns)
    nbrs = knns[:, 0, :].astype(np.int32)
    dists = knns[:, 1, :]
    if sort:
        # sort dists from low to high
        nb_idx = np.argsort(dists, axis=1)
        idxs = np.arange(nb_idx.shape[0]).reshape(-1, 1)
        dists = dists[idxs, nb_idx]
        nbrs = nbrs[idxs, nb_idx]
    return dists, nbrs
def fast_knns2spmat(knns, k, th_sim=0, use_sim=True, fill_value=None):
    # convert knns to a symmetric sparse matrix
    from scipy.sparse import csr_matrix
    eps = 1e-5
    n = len(knns)
    if isinstance(knns, list):
        knns = np.array(knns)
    if len(knns.shape) == 2:
        # knns saved by hnsw has a different shape
        n = len(knns)
        ndarr = np.ones([n, 2, k])
        ndarr[:, 0, :] = -1  # assign unknown dist to 1 and nbr to -1
        for i, (nbr, dist) in enumerate(knns):
            size = len(nbr)
            assert size == len(dist)
            ndarr[i, 0, :size] = nbr[:size]
            ndarr[i, 1, :size] = dist[:size]
        knns = ndarr
    nbrs = knns[:, 0, :]
    dists = knns[:, 1, :]
    assert -eps <= dists.min() <= dists.max() <= 1 + eps, \
        "min: {}, max: {}".format(dists.min(), dists.max())
    if use_sim:
        sims = 1. - dists
    else:
        sims = dists
    if fill_value is not None:
        print('[fast_knns2spmat] edge fill value:', fill_value)
        sims.fill(fill_value)
    row, col = np.where(sims >= th_sim)
    # remove the self-loops
    idxs = np.where(row != nbrs[row, col])
    row = row[idxs]
    col = col[idxs]
    data = sims[row, col]
    col = nbrs[row, col]  # convert to absolute column indices
    assert len(row) == len(col) == len(data)
    spmat = csr_matrix((data, (row, col)), shape=(n, n))
    return spmat
def build_knns(feats,
               k,
               knn_method,
               dump=True):
    with Timer('build index'):
        if knn_method == 'faiss':
            index = knn_faiss(feats, k, omp_num_threads=None)
        elif knn_method == 'faiss_gpu':
            index = knn_faiss_gpu(feats, k)
        else:
            raise KeyError(
                'Only faiss and faiss_gpu are supported currently ({}).'.format(knn_method))
        knns = index.get_knns()
    return knns
class knn():
    def __init__(self, feats, k, index_path='', verbose=True):
        pass

    def filter_by_th(self, i):
        th_nbrs = []
        th_dists = []
        nbrs, dists = self.knns[i]
        for n, dist in zip(nbrs, dists):
            if 1 - dist < self.th:
                continue
            th_nbrs.append(n)
            th_dists.append(dist)
        th_nbrs = np.array(th_nbrs)
        th_dists = np.array(th_dists)
        return (th_nbrs, th_dists)

    def get_knns(self, th=None):
        if th is None or th <= 0.:
            return self.knns
        # TODO: optimize the filtering process with numpy
        # nproc = mp.cpu_count()
        nproc = 1
        with Timer('filter edges by th {} (CPU={})'.format(th, nproc),
                   self.verbose):
            self.th = th
            self.th_knns = []
            tot = len(self.knns)
            if nproc > 1:
                pool = mp.Pool(nproc)
                th_knns = list(
                    tqdm(pool.imap(self.filter_by_th, range(tot)), total=tot))
                pool.close()
            else:
                th_knns = [self.filter_by_th(i) for i in range(tot)]
            return th_knns
class knn_faiss(knn):
    def __init__(self,
                 feats,
                 k,
                 nprobe=128,
                 omp_num_threads=None,
                 rebuild_index=True,
                 verbose=True,
                 **kwargs):
        import faiss
        if omp_num_threads is not None:
            faiss.omp_set_num_threads(omp_num_threads)
        self.verbose = verbose
        with Timer('[faiss] build index', verbose):
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            index.add(feats)
        with Timer('[faiss] query topk {}'.format(k), verbose):
            sims, nbrs = index.search(feats, k=k)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]
class knn_faiss_gpu(knn):
    def __init__(self,
                 feats,
                 k,
                 nprobe=128,
                 num_process=4,
                 is_precise=True,
                 sort=True,
                 verbose=True,
                 **kwargs):
        with Timer('[faiss_gpu] query topk {}'.format(k), verbose):
            dists, nbrs = faiss_search_knn(feats,
                                           k=k,
                                           nprobe=nprobe,
                                           num_process=num_process,
                                           is_precise=is_precise,
                                           sort=sort,
                                           verbose=verbose)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          np.array(dist, dtype=np.float32))
                         for nbr, dist in zip(nbrs, dists)]
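# --- Usage sketch (not part of the commit) ------------------------------------
# A typical pipeline over l2-normalized features: exact faiss search, then the
# re-ordered (dists, nbrs) arrays, then a sparse affinity matrix thresholded at
# th_sim. Sizes and the threshold below are assumptions; run as a module
# (python -m ...) so the relative import at the top of this file resolves.
if __name__ == '__main__':
    feats = np.random.rand(500, 64).astype('float32')
    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
    k = 10
    knns = build_knns(feats, k, knn_method='faiss')
    dists, nbrs = knns2ordered_nbrs(knns)         # both [500, 10], dists ascending
    spmat = fast_knns2spmat(knns, k, th_sim=0.5)  # sparse [500, 500] affinity matrix
    print(dists.shape, nbrs.shape, spmat.shape)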
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
from __future__ import division
import numpy as np
from sklearn.metrics.cluster import (contingency_matrix,
                                     normalized_mutual_info_score)
from sklearn.metrics import (precision_score, recall_score)
__all__ = ['pairwise', 'bcubed', 'nmi', 'precision', 'recall', 'accuracy']
def _check(gt_labels, pred_labels):
    if gt_labels.ndim != 1:
        raise ValueError("gt_labels must be 1D: shape is %r" %
                         (gt_labels.shape, ))
    if pred_labels.ndim != 1:
        raise ValueError("pred_labels must be 1D: shape is %r" %
                         (pred_labels.shape, ))
    if gt_labels.shape != pred_labels.shape:
        raise ValueError(
            "gt_labels and pred_labels must have same size, got %d and %d" %
            (gt_labels.shape[0], pred_labels.shape[0]))
    return gt_labels, pred_labels


def _get_lb2idxs(labels):
    lb2idxs = {}
    for idx, lb in enumerate(labels):
        if lb not in lb2idxs:
            lb2idxs[lb] = []
        lb2idxs[lb].append(idx)
    return lb2idxs


def _compute_fscore(pre, rec):
    return 2. * pre * rec / (pre + rec)
def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    '''The original function is from `sklearn.metrics.fowlkes_mallows_score`.
    We output the pairwise precision, pairwise recall and F-measure,
    instead of calculating the geometric mean of precision and recall.
    '''
    n_samples, = gt_labels.shape
    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    tk = np.dot(c.data, c.data) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples
    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)
    return avg_pre, avg_rec, fscore
def pairwise(gt_labels, pred_labels, sparse=True):
    _check(gt_labels, pred_labels)
    return fowlkes_mallows_score(gt_labels, pred_labels, sparse)


def bcubed(gt_labels, pred_labels):
    _check(gt_labels, pred_labels)
    gt_lb2idxs = _get_lb2idxs(gt_labels)
    pred_lb2idxs = _get_lb2idxs(pred_labels)
    num_lbs = len(gt_lb2idxs)
    pre = np.zeros(num_lbs)
    rec = np.zeros(num_lbs)
    gt_num = np.zeros(num_lbs)
    for i, gt_idxs in enumerate(gt_lb2idxs.values()):
        all_pred_lbs = np.unique(pred_labels[gt_idxs])
        gt_num[i] = len(gt_idxs)
        for pred_lb in all_pred_lbs:
            pred_idxs = pred_lb2idxs[pred_lb]
            n = 1. * np.intersect1d(gt_idxs, pred_idxs).size
            pre[i] += n**2 / len(pred_idxs)
            rec[i] += n**2 / gt_num[i]
    gt_num = gt_num.sum()
    avg_pre = pre.sum() / gt_num
    avg_rec = rec.sum() / gt_num
    fscore = _compute_fscore(avg_pre, avg_rec)
    return avg_pre, avg_rec, fscore


def nmi(gt_labels, pred_labels):
    return normalized_mutual_info_score(pred_labels, gt_labels)


def precision(gt_labels, pred_labels):
    return precision_score(gt_labels, pred_labels)


def recall(gt_labels, pred_labels):
    return recall_score(gt_labels, pred_labels)


def accuracy(gt_labels, pred_labels):
    return np.mean(gt_labels == pred_labels)
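# --- Usage sketch (not part of the commit) ------------------------------------
# pairwise() reports pairwise precision / recall / F-score (the Fowlkes-Mallows
# building blocks), while bcubed() averages per-ground-truth-cluster precision
# and recall. The toy labels below are assumptions.
if __name__ == '__main__':
    gt = np.array([0, 0, 0, 1, 1, 2])
    pred = np.array([0, 0, 1, 1, 1, 2])
    print('pairwise:', pairwise(gt, pred))
    print('bcubed:  ', bcubed(gt, pred))
    print('nmi:     ', nmi(gt, pred))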
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster
"""
import os
import time
import json
import pickle
import random
import numpy as np
class TextColors:
    HEADER = '\033[35m'
    OKBLUE = '\033[34m'
    OKGREEN = '\033[32m'
    WARNING = '\033[33m'
    FATAL = '\033[31m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


class Timer():
    def __init__(self, name='task', verbose=True):
        self.name = name
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.verbose:
            print('[Time] {} consumes {:.4f} s'.format(
                self.name,
                time.time() - self.start))
        return exc_type is None
def set_random_seed(seed, cuda=False):
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)


def l2norm(vec):
    vec /= np.linalg.norm(vec, axis=1).reshape(-1, 1)
    return vec


def is_l2norm(features, size):
    rand_i = random.choice(range(size))
    norm_ = np.dot(features[rand_i, :], features[rand_i, :])
    return abs(norm_ - 1) < 1e-6


def is_spmat_eq(a, b):
    return (a != b).nnz == 0


def aggregate(features, adj, times):
    dtype = features.dtype
    for i in range(times):
        features = adj * features
    return features.astype(dtype)


def mkdir_if_no_exists(path, subdirs=[''], is_folder=False):
    if path == '':
        return
    for sd in subdirs:
        if sd != '' or is_folder:
            d = os.path.dirname(os.path.join(path, sd))
        else:
            d = os.path.dirname(path)
        if not os.path.exists(d):
            os.makedirs(d)
def stop_iterating(current_l, total_l, early_stop, num_edges_add_this_level, num_edges_add_last_level, knn_k):
    # Stopping rule 1: all requested levels have been run
    if current_l == total_l - 1:
        return True
    # Stopping rule 2: the new level adds no new edges
    if num_edges_add_this_level == 0:
        return True
    # Stopping rule 3: early stopping when two consecutive levels produce similar numbers of edges
    if early_stop and float(num_edges_add_last_level) / num_edges_add_this_level < knn_k - 1:
        return True
    return False
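# --- Usage sketch (not part of the commit) ------------------------------------
# Timer is a context manager that prints the elapsed time of the wrapped block,
# and stop_iterating encodes the three stopping rules used when the cluster
# hierarchy is grown level by level. The numbers below are assumptions.
if __name__ == '__main__':
    with Timer('toy task'):
        _ = sum(range(1000000))
    # not the last level, new edges were added, and the edge count is still
    # shrinking by at least a factor of knn_k - 1 -> keep iterating
    print(stop_iterating(current_l=1, total_l=10, early_stop=True,
                         num_edges_add_this_level=100,
                         num_edges_add_last_level=1000,
                         knn_k=10))  # False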