# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
from __future__ import annotations
from typing import Tuple
import warnings
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted, check_consistent_length, check_array
from .base import HubnessReduction
class DisSimLocal(HubnessReduction):
""" Hubness reduction with DisSimLocal [1]_.
Parameters
----------
k: int, default = 5
Number of neighbors to consider for the local centroids
squared: bool, default = True
DisSimLocal operates on squared Euclidean distances.
If True, return (quasi) squared Euclidean distances;
        if False, return (quasi) Euclidean distances.
References
----------
.. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
Flattening the density gradient for eliminating spatial centrality to reduce hubness.
In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659–1665.
https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
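
    Examples
    --------
    A minimal usage sketch; the neighbor matrices are assumed to come from an
    exact k-nearest-neighbor search over the training data:

    >>> import numpy as np
    >>> from sklearn.neighbors import NearestNeighbors
    >>> X = np.random.RandomState(0).rand(100, 10)
    >>> neigh_dist, neigh_ind = NearestNeighbors(n_neighbors=10).fit(X).kneighbors(X)
    >>> dsl = DisSimLocal(k=5).fit(neigh_dist, neigh_ind, X)
    >>> hub_reduced_dist, neigh_ind = dsl.transform(neigh_dist, neigh_ind, X)
    >>> hub_reduced_dist.shape
    (100, 10)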
"""
    def __init__(self, k: int = 5, squared: bool = True, *args, **kwargs):
super().__init__()
self.k = k
self.squared = squared
    def fit(self, neigh_dist: np.ndarray, neigh_ind: np.ndarray, X: np.ndarray,
assume_sorted: bool = True, *args, **kwargs) -> DisSimLocal:
""" Fit the model using X, neigh_dist, and neigh_ind as training data.
Parameters
----------
neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).
neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
Neighbor indices corresponding to the values in neigh_dist.
X: np.ndarray, shape (n_samples, n_features)
Training data, where n_samples is the number of vectors,
and n_features their dimensionality (number of features).
assume_sorted: bool, default = True
Assume input matrices are sorted according to neigh_dist.
            If False, the k nearest neighbors are determined here by partial sorting.
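
        Returns
        -------
        self : DisSimLocal
            The fitted instance, storing the training vectors, their local
            centroids, and their squared distances to these centroids.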
"""
# Check equal number of rows and columns
check_consistent_length(neigh_ind, neigh_dist)
check_consistent_length(neigh_ind.T, neigh_dist.T)
X = check_array(X)
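        # Validate the neighborhood size: k must be a positive integer;
        # a non-numeric k makes the comparison below raise a TypeError.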
try:
if self.k <= 0:
raise ValueError(f"Expected k > 0. Got {self.k}")
except TypeError:
raise TypeError(f'Expected k: int > 0. Got {self.k}')
k = self.k
if k > neigh_ind.shape[1]:
warnings.warn(f'Neighborhood parameter k larger than provided neighbors in neigh_dist, neigh_ind. '
f'Will reduce to k={neigh_ind.shape[1]}.')
k = neigh_ind.shape[1]
# Calculate local neighborhood centroids among the training points
if assume_sorted:
knn = neigh_ind[:, :k]
else:
mask = np.argpartition(neigh_dist, kth=k-1)[:, :k]
knn = np.take_along_axis(neigh_ind, mask, axis=1)
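        # X[knn] has shape (n_samples, k, n_features); averaging over the
        # neighbor axis yields each training point's local centroid.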
centroids = X[knn].mean(axis=1)
dist_to_cent = row_norms(X - centroids, squared=True)
self.X_train_ = X
self.X_train_centroids_ = centroids
self.X_train_dist_to_centroids_ = dist_to_cent
return self
    def transform(self, neigh_dist: np.ndarray, neigh_ind: np.ndarray, X: np.ndarray,
                  assume_sorted: bool = True, *args, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
""" Transform distance between test and training data with DisSimLocal.
Parameters
----------
neigh_dist: np.ndarray, shape (n_query, n_neighbors)
Distance matrix of test objects (rows) against their individual
k nearest neighbors among the training data (columns).
neigh_ind: np.ndarray, shape (n_query, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.
X: np.ndarray, shape (n_query, n_features)
Test data, where n_query is the number of vectors,
and n_features their dimensionality (number of features).
assume_sorted: ignored
Returns
-------
hub_reduced_dist, neigh_ind
DisSimLocal distances, and corresponding neighbor indices
Notes
-----
The returned distances are NOT sorted! If you use this class directly,
you will need to sort the returned matrices according to hub_reduced_dist.
Classes from :mod:`skhubness.neighbors` do this automatically.
"""
check_is_fitted(self, ['X_train_', 'X_train_centroids_', 'X_train_dist_to_centroids_'])
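        # If no test data are provided, transform the training data themselves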
if X is None:
X = self.X_train_
else:
X = check_array(X)
n_test, n_indexed = neigh_dist.shape
if n_indexed == 1:
            warnings.warn('Cannot perform hubness reduction with a single neighbor per query. '
                          'Skipping hubness reduction, and returning untransformed distances.')
return neigh_dist, neigh_ind
k = self.k
if k > neigh_ind.shape[1]:
warnings.warn(f'Neighborhood parameter k larger than provided neighbors in neigh_dist, neigh_ind. '
f'Will reduce to k={neigh_ind.shape[1]}.')
k = neigh_ind.shape[1]
        # Calculate local neighborhood centroids for test objects among training objects:
        # partition the neighbor indices so that the k nearest come first, then
        # recompute squared Euclidean distances in that same order, keeping
        # neigh_dist and neigh_ind column-aligned.
        mask = np.argpartition(neigh_dist, kth=k-1)
        neigh_ind = np.take_along_axis(neigh_ind, mask, axis=1)
        neigh_dist = np.empty_like(neigh_dist)
        for i, ind in enumerate(neigh_ind):
            neigh_dist[i, :] = euclidean_distances(X[i].reshape(1, -1), self.X_train_[ind], squared=True)
knn = neigh_ind[:, :k]
centroids = self.X_train_centroids_[knn].mean(axis=1)
        X_test_dist_to_centroids = row_norms(X - centroids, squared=True)
X_train_dist_to_centroids = self.X_train_dist_to_centroids_[neigh_ind]
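        # DisSimLocal dissimilarity (cf. [1]_ in the class docstring):
        #   DSL(x, y) = ||x - y||^2 - ||x - c_k(x)||^2 - ||y - c_k(y)||^2,
        # where c_k(z) is the centroid of the k nearest training neighbors of z.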
hub_reduced_dist = neigh_dist.copy()
hub_reduced_dist -= X_test_dist_to_centroids[:, np.newaxis]
hub_reduced_dist -= X_train_dist_to_centroids
# DisSimLocal can yield negative dissimilarities, which can cause problems with
# certain scikit-learn routines (e.g. in metric='precomputed' usages).
# We, therefore, shift dissimilarities to non-negative values, if necessary.
min_dist = hub_reduced_dist.min()
if min_dist < 0.:
hub_reduced_dist += (-min_dist)
# Return Euclidean or squared Euclidean distances?
if not self.squared:
hub_reduced_dist **= (1 / 2)
# Return the hubness reduced distances
# These must be sorted downstream
return hub_reduced_dist, neigh_ind
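
if __name__ == "__main__":
    # Minimal demonstration sketch (not part of the library API); run it as
    # ``python -m skhubness.reduction.dis_sim`` so the relative import above
    # resolves. All data below are synthetic.
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.RandomState(0)
    X_train, X_test = rng.rand(100, 10), rng.rand(10, 10)
    nn = NearestNeighbors(n_neighbors=10).fit(X_train)

    # Fit on the training neighborhoods, then transform test-vs-train distances
    dsl = DisSimLocal(k=5).fit(*nn.kneighbors(X_train), X_train)
    hub_reduced_dist, neigh_ind = dsl.transform(*nn.kneighbors(X_test), X_test)

    # The returned distances are unsorted (see transform's Notes), so sort them
    sort_ix = np.argsort(hub_reduced_dist, axis=1)
    hub_reduced_dist = np.take_along_axis(hub_reduced_dist, sort_ix, axis=1)
    neigh_ind = np.take_along_axis(neigh_ind, sort_ix, axis=1)
    print(neigh_ind[0], hub_reduced_dist[0], sep="\n")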