Source code for skhubness.reduction.mutual_proximity

# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause

from __future__ import annotations
import warnings

import numpy as np
from scipy import stats
from sklearn.utils.validation import check_is_fitted, check_consistent_length, check_array
from tqdm.auto import tqdm

from .base import HubnessReduction


class MutualProximity(HubnessReduction):
    """ Hubness reduction with Mutual Proximity [1]_.

    Parameters
    ----------
    method: 'normal' or 'empiric', default = 'normal'
        Model distance distribution with 'method'.

        - 'normal' or 'gaussi' model distance distributions with independent Gaussians (fast)
        - 'empiric' or 'exact' model distances with the empiric distributions (slow)

    verbose: int, default = 0
        If verbose > 0, show progress bar.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space.
           The Journal of Machine Learning Research, 13(1), 2871–2902.
    """

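    # For a pair of objects x and y at distance d_xy, Mutual Proximity is the
    # probability that a third object j is farther away from both,
    #     MP(d_xy) = P(D_x > d_xy  and  D_y > d_xy),
    # where D_x and D_y are the distributions of distances from x and y to the
    # remaining objects; 1 - MP(d_xy) is returned as the hubness-reduced distance.
    # 'empiric' estimates this probability directly from the stored training
    # distances, while 'normal' models D_x and D_y as independent Gaussians.
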
    def __init__(self, method: str = 'normal', verbose: int = 0, **kwargs):
        super().__init__(**kwargs)
        self.method = method
        self.verbose = verbose

    def fit(self, neigh_dist, neigh_ind, X=None, assume_sorted=None, *args, **kwargs) -> MutualProximity:
        """ Fit the model using neigh_dist and neigh_ind as training data.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
            Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).

        neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.

        X: ignored

        assume_sorted: ignored
        """
        # Check equal number of rows and columns
        check_consistent_length(neigh_ind, neigh_dist)
        check_consistent_length(neigh_ind.T, neigh_dist.T)
        check_array(neigh_dist, force_all_finite=False)
        check_array(neigh_ind)

        self.n_train = neigh_dist.shape[0]

        if self.method in ['exact', 'empiric']:
            self.method = 'empiric'
            self.neigh_dist_train_ = neigh_dist
            self.neigh_ind_train_ = neigh_ind
        elif self.method in ['normal', 'gaussi']:
            self.method = 'normal'
            self.mu_train_ = np.nanmean(neigh_dist, axis=1)
            self.sd_train_ = np.nanstd(neigh_dist, axis=1, ddof=0)
        else:
            raise ValueError(f'Mutual proximity method "{self.method}" not recognized. Try "normal" or "empiric".')

        return self

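    # fit() only collects the statistics required later: the full neighbor
    # distance/index lists of the training data for 'empiric', and the
    # per-object mean and standard deviation of neighbor distances for 'normal'.
    # The actual rescaling of distances happens in transform().
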
    def transform(self, neigh_dist, neigh_ind, X=None, assume_sorted=None, *args, **kwargs):
        """ Transform distance between test and training data with Mutual Proximity.

        Parameters
        ----------
        neigh_dist: np.ndarray
            Distance matrix of test objects (rows) against their individual
            k nearest neighbors among the training data (columns).

        neigh_ind: np.ndarray
            Neighbor indices corresponding to the values in neigh_dist

        X: ignored

        assume_sorted: ignored

        Returns
        -------
        hub_reduced_dist, neigh_ind
            Mutual Proximity distances, and corresponding neighbor indices

        Notes
        -----
        The returned distances are NOT sorted! If you use this class directly,
        you will need to sort the returned matrices according to hub_reduced_dist.
        Classes from :mod:`skhubness.neighbors` do this automatically.
        """
        check_is_fitted(self, ['mu_train_', 'sd_train_', 'neigh_dist_train_', 'neigh_ind_train_'], all_or_any=any)
        check_array(neigh_dist, force_all_finite='allow-nan')
        check_array(neigh_ind)

        n_test, n_indexed = neigh_dist.shape

        if n_indexed == 1:
            warnings.warn('Cannot perform hubness reduction with a single neighbor per query. '
                          'Skipping hubness reduction, and returning untransformed distances.')
            return neigh_dist, neigh_ind

        hub_reduced_dist = np.empty_like(neigh_dist)

        # Show progress in hubness reduction loop
        disable_tqdm = False if self.verbose else True
        range_n_test = tqdm(range(n_test),
                            desc=f'MP ({self.method})',
                            disable=disable_tqdm,
                            )

        # Calculate MP with independent Gaussians
        if self.method == 'normal':
            mu_train = self.mu_train_
            sd_train = self.sd_train_
            for i in range_n_test:
                j_mom = neigh_ind[i]
                mu = np.nanmean(neigh_dist[i])
                sd = np.nanstd(neigh_dist[i], ddof=0)
                p1 = stats.norm.sf(neigh_dist[i, :], mu, sd)
                p2 = stats.norm.sf(neigh_dist[i, :], mu_train[j_mom], sd_train[j_mom])
                hub_reduced_dist[i, :] = (1 - p1 * p2).ravel()
        # Calculate MP empiric (slow)
        elif self.method == 'empiric':
            max_ind = self.neigh_ind_train_.max()
            for i in range_n_test:
                dI = neigh_dist[i, :][np.newaxis, :]  # broadcasted afterwards
                dJ = np.zeros((dI.size, n_indexed))
                for j in range(n_indexed):
                    tmp = np.zeros(max_ind + 1) + (self.neigh_dist_train_[neigh_ind[i, j], -1] + 1e-6)
                    tmp[self.neigh_ind_train_[neigh_ind[i, j]]] = self.neigh_dist_train_[neigh_ind[i, j]]
                    dJ[j, :] = tmp[neigh_ind[i]]
                # dJ = self.neigh_dist_train_[neigh_ind[i], :n_indexed]
                d = dI.T
                hub_reduced_dist[i, :] = 1. - (np.sum((dI > d) & (dJ > d), axis=1) / n_indexed)
        else:
            raise ValueError(f"Internal: Invalid method {self.method}.")

        # Return the hubness reduced distances
        # These must be sorted downstream
        return hub_reduced_dist, neigh_ind

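# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module above). It assumes
# scikit-learn's NearestNeighbors to obtain the k-nearest-neighbor distances
# and indices that fit()/transform() expect; in practice, classes from
# skhubness.neighbors wire this up and sort the returned distances for you.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import NearestNeighbors

    X, _ = make_classification(n_samples=200, n_features=20, random_state=0)
    X_train, X_test = train_test_split(X, random_state=0)

    # k-nearest neighbors of the training objects among themselves
    nn = NearestNeighbors(n_neighbors=10).fit(X_train)
    neigh_dist_train, neigh_ind_train = nn.kneighbors(X_train)

    mp = MutualProximity(method='normal', verbose=0)
    mp.fit(neigh_dist_train, neigh_ind_train)

    # k-nearest neighbors of the test objects among the training objects
    neigh_dist_test, neigh_ind_test = nn.kneighbors(X_test)
    mp_dist, mp_ind = mp.transform(neigh_dist_test, neigh_ind_test)

    # The returned distances are not sorted; sort per row before ranking neighbors
    order = np.argsort(mp_dist, axis=1)
    mp_ind_sorted = np.take_along_axis(mp_ind, order, axis=1)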