Source code for skhubness.reduction.local_scaling
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
from __future__ import annotations
import warnings
import numpy as np
from sklearn.utils.validation import check_is_fitted, check_consistent_length
from tqdm.auto import tqdm
from .base import HubnessReduction
class LocalScaling(HubnessReduction):
    """ Hubness reduction with Local Scaling [1]_.

    Parameters
    ----------
    k: int, default = 5
        Number of neighbors to consider for the rescaling.
        Internally the first ``k + 1`` columns of the (sorted) neighbor
        distances are used, or fewer if fewer columns are available.
    method: 'standard' or 'nicdm', default = 'standard'
        Perform local scaling with the specified variant:

        - 'standard' or 'ls' rescale distances using the last of the
          selected neighbor distances (the one at column index ``k``)
        - 'nicdm' rescales distances using the mean over the selected
          neighbor distances
    verbose: int, default = 0
        If verbose > 0, show progress bar.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """

    def __init__(self, k: int = 5, method: str = 'standard', verbose: int = 0, **kwargs):
        super().__init__(**kwargs)
        self.k = k
        self.method = method
        self.verbose = verbose

    @staticmethod
    def _smallest_columns(neigh_dist, n_cols: int) -> np.ndarray:
        """ Return, per row, the column indices of the ``n_cols`` smallest distances.

        The returned indices are in ascending order of distance, so that
        selecting them with ``np.take_along_axis`` yields the same leading
        columns a fully sorted matrix would have.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
            Unsorted neighbor distances.
        n_cols: int
            Number of leading columns requested. Clipped to ``n_neighbors``,
            mirroring how plain slicing behaves on sorted input.

        Returns
        -------
        mask: np.ndarray, shape (n_samples, min(n_cols, n_neighbors))
        """
        n_cols = min(n_cols, neigh_dist.shape[1])
        # Every one of the first n_cols positions must be a partition pivot;
        # otherwise the last selected column is an arbitrary element rather
        # than the next-nearest neighbor.
        kth = np.arange(n_cols)
        return np.argpartition(neigh_dist, kth=kth, axis=1)[:, :n_cols]

    def fit(self, neigh_dist, neigh_ind, X=None, assume_sorted: bool = True, *args, **kwargs) -> LocalScaling:
        """ Fit the model using neigh_dist and neigh_ind as training data.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
            Distance matrix of training objects (rows) against their
            individual k nearest neighbors (columns).
        neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist.
        X: ignored
        assume_sorted: bool, default = True
            Assume input matrices are sorted according to neigh_dist.
            If False, the required leading columns are selected here.

        Returns
        -------
        self: LocalScaling
            The fitted instance, with ``r_dist_train_`` and ``r_ind_train_`` set.
        """
        # Check equal number of rows and columns
        check_consistent_length(neigh_ind, neigh_dist)
        check_consistent_length(neigh_ind.T, neigh_dist.T)

        # increment to include the k-th element in slicing
        k = self.k + 1

        # Keep the distances (and indices) of the leading k columns; they
        # provide the scaling statistic for standard LS and NICDM.
        if assume_sorted:
            self.r_dist_train_ = neigh_dist[:, :k]
            self.r_ind_train_ = neigh_ind[:, :k]
        else:
            mask = self._smallest_columns(neigh_dist, k)
            self.r_dist_train_ = np.take_along_axis(neigh_dist, mask, axis=1)
            self.r_ind_train_ = np.take_along_axis(neigh_ind, mask, axis=1)
        return self

    def transform(self, neigh_dist, neigh_ind, X=None,
                  assume_sorted: bool = True, *args, **kwargs) -> tuple[np.ndarray, np.ndarray]:
        """ Transform distance between test and training data with Local Scaling.

        Parameters
        ----------
        neigh_dist: np.ndarray, shape (n_query, n_neighbors)
            Distance matrix of test objects (rows) against their individual
            k nearest neighbors among the training data (columns).
        neigh_ind: np.ndarray, shape (n_query, n_neighbors)
            Neighbor indices corresponding to the values in neigh_dist
        X: ignored
        assume_sorted: bool, default = True
            Assume input matrices are sorted according to neigh_dist.
            If False, these are partitioned here.
            NOTE: The returned matrices are never sorted.

        Returns
        -------
        hub_reduced_dist, neigh_ind
            Local scaling distances, and corresponding neighbor indices

        Raises
        ------
        ValueError
            If ``self.method`` is not one of 'ls', 'standard', or 'nicdm'.

        Notes
        -----
        The returned distances are NOT sorted! If you use this class directly,
        you will need to sort the returned matrices according to hub_reduced_dist.
        Classes from :mod:`skhubness.neighbors` do this automatically.
        """
        check_is_fitted(self, 'r_dist_train_')

        n_test, n_indexed = neigh_dist.shape

        if n_indexed == 1:
            warnings.warn(f'Cannot perform hubness reduction with a single neighbor per query. '
                          f'Skipping hubness reduction, and returning untransformed distances.')
            return neigh_dist, neigh_ind

        # Reject unknown variants before any work is done (and before a
        # progress bar is opened that would never be advanced).
        if self.method not in ('ls', 'standard', 'nicdm'):
            raise ValueError(f"Internal: Invalid method {self.method}. Try 'ls' or 'nicdm'.")

        # increment to include the k-th element in slicing
        k = self.k + 1

        # Select the leading k columns of the query distances
        if assume_sorted:
            r_dist_test = neigh_dist[:, :k]
        else:
            mask = self._smallest_columns(neigh_dist, k)
            r_dist_test = np.take_along_axis(neigh_dist, mask, axis=1)

        # The rescaled values are real-valued; an integer input dtype must
        # not be inherited, or the results would be truncated on assignment.
        if np.issubdtype(neigh_dist.dtype, np.floating):
            out_dtype = neigh_dist.dtype
        else:
            out_dtype = np.float64
        hub_reduced_dist = np.empty_like(neigh_dist, dtype=out_dtype)

        # Optionally show progress of local scaling loop
        range_n_test = tqdm(range(n_test),
                            desc=f'LS {self.method}',
                            disable=not self.verbose,
                            )

        # Perform standard local scaling...
        if self.method in ('ls', 'standard'):
            r_train = self.r_dist_train_[:, -1]
            r_test = r_dist_test[:, -1]
            for i in range_n_test:
                hub_reduced_dist[i, :] = \
                    1. - np.exp(-1 * neigh_dist[i] ** 2 / (r_test[i] * r_train[neigh_ind[i]]))
        # ...or use non-iterative contextual dissimilarity measure
        else:
            r_train = self.r_dist_train_.mean(axis=1)
            r_test = r_dist_test.mean(axis=1)
            for i in range_n_test:
                hub_reduced_dist[i, :] = neigh_dist[i] / np.sqrt((r_test[i] * r_train[neigh_ind[i]]))

        # Return the hubness reduced distances
        # These must be sorted downstream
        return hub_reduced_dist, neigh_ind