#!/usr/bin/env python # -*- coding: utf-8 -*- """ This file re-uses implementation from https://github.com/yl-1993/learn-to-cluster """ from itertools import groupby import numpy as np import torch from tqdm import tqdm __all__ = [ "density_estimation", "density_to_peaks", "density_to_peaks_vectorize", ] def density_estimation(dists, nbrs, labels, **kwargs): """use supervised density defined on neigborhood""" num, k_knn = dists.shape conf = np.ones((num,), dtype=np.float32) ind_array = labels[nbrs] == np.expand_dims(labels, 1).repeat(k_knn, 1) pos = ((1 - dists[:, 1:]) * ind_array[:, 1:]).sum(1) neg = ((1 - dists[:, 1:]) * (1 - ind_array[:, 1:])).sum(1) conf = (pos - neg) * conf conf /= k_knn - 1 return conf def density_to_peaks_vectorize(dists, nbrs, density, max_conn=1, name=""): # just calculate 1 connectivity assert dists.shape[0] == density.shape[0] assert dists.shape == nbrs.shape num, k = dists.shape if name == "gcn_feat": include_mask = nbrs != np.arange(0, num).reshape(-1, 1) secondary_mask = ( np.sum(include_mask, axis=1) == k ) # TODO: the condition == k should not happen as distance to the node self should be smallest, check for numerical stability; TODO: make top M instead of only supporting top 1 include_mask[secondary_mask, -1] = False nbrs_exclude_self = nbrs[include_mask].reshape(-1, k - 1) # (V, 79) dists_exclude_self = dists[include_mask].reshape(-1, k - 1) # (V, 79) else: include_mask = nbrs != np.arange(0, num).reshape(-1, 1) nbrs_exclude_self = nbrs[include_mask].reshape(-1, k - 1) # (V, 79) dists_exclude_self = dists[include_mask].reshape(-1, k - 1) # (V, 79) compare_map = density[nbrs_exclude_self] > density.reshape(-1, 1) peak_index = np.argmax(np.where(compare_map, 1, 0), axis=1) # (V,) compare_map_sum = np.sum(compare_map.cpu().data.numpy(), axis=1) # (V,) dist2peak = { i: [] if compare_map_sum[i] == 0 else [dists_exclude_self[i, peak_index[i]]] for i in range(num) } peaks = { i: [] if compare_map_sum[i] == 0 else [nbrs_exclude_self[i, peak_index[i]]] for i in range(num) } return dist2peak, peaks def density_to_peaks(dists, nbrs, density, max_conn=1, sort="dist"): # Note that dists has been sorted in ascending order assert dists.shape[0] == density.shape[0] assert dists.shape == nbrs.shape num, _ = dists.shape dist2peak = {i: [] for i in range(num)} peaks = {i: [] for i in range(num)} for i, nbr in tqdm(enumerate(nbrs)): nbr_conf = density[nbr] for j, c in enumerate(nbr_conf): nbr_idx = nbr[j] if i == nbr_idx or c <= density[i]: continue dist2peak[i].append(dists[i, j]) peaks[i].append(nbr_idx) if len(dist2peak[i]) >= max_conn: break return dist2peak, peaks