Source code for skhubness.neighbors.lsh

# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause

# PEP 563: Postponed Evaluation of Annotations
from __future__ import annotations

from functools import partial
import logging
import sys
from typing import Tuple, Union
import warnings
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.metrics.pairwise import cosine_distances
from sklearn.utils.validation import check_is_fitted, check_array, check_X_y
try:
    import puffinn
except ImportError:
    logging.warning("The package 'puffinn' is not available.")  # pragma: no cover
try:
    import falconn
except ImportError:
    logging.warning("The package 'falconn' is not available.")  # pragma: no cover
from tqdm.auto import tqdm
from .approximate_neighbors import ApproximateNearestNeighbor
from ..utils.check import check_n_candidates

__all__ = ['FalconnLSH', 'PuffinnLSH', ]


[docs]class PuffinnLSH(BaseEstimator, ApproximateNearestNeighbor): """ Wrap Puffinn LSH for scikit-learn compatibility. Parameters ---------- n_candidates: int, default = 5 Number of neighbors to retrieve metric: str, default = 'euclidean' Distance metric, allowed are "angular", "jaccard". Other metrics are partially supported, such as 'euclidean', 'sqeuclidean'. In these cases, 'angular' distances are used to find the candidate set of neighbors with LSH among all indexed objects, and (squared) Euclidean distances are subsequently only computed for the candidates. memory: int, default = 1GB Max memory usage recall: float, default = 0.90 Probability of finding the true nearest neighbors among the candidates n_jobs: int, default = 1 Number of parallel jobs verbose: int, default = 0 Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. Attributes ---------- valid_metrics: List of valid distance metrics/measures """ valid_metrics = ["angular", "cosine", "euclidean", "sqeuclidean", "minkowski", "jaccard", ] metric_map = {'euclidean': 'angular', 'sqeuclidean': 'angular', 'minkowski': 'angular', 'cosine': 'angular', }
[docs] def __init__(self, n_candidates: int = 5, metric: str = 'euclidean', memory: int = 1024**3, recall: float = 0.9, n_jobs: int = 1, verbose: int = 0, ): super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=n_jobs, verbose=verbose, ) self.memory = memory self.recall = recall
[docs] def fit(self, X, y=None) -> PuffinnLSH: """ Build the puffinn LSH index and insert data from X. Parameters ---------- X: np.array Data to be indexed y: any Ignored Returns ------- self: Puffinn An instance of Puffinn with a built index """ if y is None: X = check_array(X) else: X, y = check_X_y(X, y) self.y_train_ = y if self.metric not in self.valid_metrics: warnings.warn(f'Invalid metric "{self.metric}". Using "euclidean" instead') self.metric = 'euclidean' try: self.effective_metric = self.metric_map[self.metric] except KeyError: self.effective_metric = self.metric # Reduce default memory consumption for unit tests if "pytest" in sys.modules: self.memory = 3*1024**2 # Construct the index index = puffinn.Index(self.effective_metric, X.shape[1], self.memory, ) disable_tqdm = False if self.verbose else True for v in tqdm(X, desc='Indexing', disable=disable_tqdm): index.insert(v.tolist()) index.rebuild(num_threads=self.n_jobs) self.index_ = index self.X_train_ = X # remove, once we can retrieve vectors from the index itself return self
[docs] def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: """ Retrieve k nearest neighbors. Parameters ---------- X: np.array or None, optional, default = None Query objects. If None, search among the indexed objects. n_candidates: int or None, optional, default = None Number of neighbors to retrieve. If None, use the value passed during construction. return_distance: bool, default = True If return_distance, will return distances and indices to neighbors. Else, only return the indices. """ check_is_fitted(self, 'index_') if n_candidates is None: n_candidates = self.n_candidates n_candidates = check_n_candidates(n_candidates) # For compatibility reasons, as each sample is considered as its own # neighbor, one extra neighbor will be computed. if X is None: X = self.X_train_ n_neighbors = n_candidates + 1 start = 1 else: X = check_array(X) n_neighbors = n_candidates start = 0 n_test = X.shape[0] dtype = X.dtype # If chosen metric is not among the natively support ones, reorder the neighbors reorder = True if self.metric not in ('angular', 'cosine', 'jaccard') else False # If fewer candidates than required are found for a query, # we save index=-1 and distance=NaN neigh_ind = -np.ones((n_test, n_candidates), dtype=np.int32) if return_distance or reorder: neigh_dist = np.empty_like(neigh_ind, dtype=dtype) * np.nan metric = 'cosine' if self.metric == 'angular' else self.metric index = self.index_ disable_tqdm = False if self.verbose else True for i, x in tqdm(enumerate(X), desc='Querying', disable=disable_tqdm, ): # Find the approximate nearest neighbors. # Each of the true `n_candidates` nearest neighbors # has at least `recall` chance of being found. ind = index.search(x.tolist(), n_neighbors, self.recall, ) ind = ind[start:] neigh_ind[i, :len(ind)] = ind if return_distance or reorder: neigh_dist[i, :len(ind)] = pairwise_distances(x.reshape(1, -1), self.X_train_[ind], metric=metric, ) if reorder: sort = np.argsort(neigh_dist, axis=1) neigh_dist = np.take_along_axis(neigh_dist, sort, axis=1) neigh_ind = np.take_along_axis(neigh_ind, sort, axis=1) if return_distance: return neigh_dist, neigh_ind else: return neigh_ind
[docs]class FalconnLSH(ApproximateNearestNeighbor): """Wrapper for using falconn LSH Falconn is an approximate nearest neighbor library, that uses multiprobe locality-sensitive hashing. Parameters ---------- n_candidates: int, default = 5 Number of neighbors to retrieve radius: float or None, optional, default = None Retrieve neighbors within this radius. Can be negative: See Notes. metric: str, default = 'euclidean' Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot" num_probes: int, default = 50 The number of buckets the query algorithm probes. The higher number of probes is, the better accuracy one gets, but the slower queries are. n_jobs: int, default = 1 Number of parallel jobs verbose: int, default = 0 Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. Attributes ---------- valid_metrics: List of valid distance metrics/measures Notes ----- From the falconn docs: radius can be negative, and for the distance function 'negative_inner_product' it actually makes sense. """ valid_metrics = ['euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean', 'cosine', 'neg_inner', 'NegativeInnerProduct']
[docs] def __init__(self, n_candidates: int = 5, radius: float = 1., metric: str = 'euclidean', num_probes: int = 50, n_jobs: int = 1, verbose: int = 0): super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=n_jobs, verbose=verbose, ) self.num_probes = num_probes self.radius = radius
[docs] def fit(self, X: np.ndarray, y: np.ndarray = None) -> FalconnLSH: """ Setup the LSH index from training data. Parameters ---------- X: np.array Data to be indexed y: any Ignored Returns ------- self: FalconnLSH An instance of LSH with a built index """ X = check_array(X, dtype=[np.float32, np.float64]) if self.metric in ['euclidean', 'l2', 'minkowski']: self.metric = 'euclidean' distance = falconn.DistanceFunction.EuclideanSquared elif self.metric in ['squared_euclidean', 'sqeuclidean']: self.metric = 'sqeuclidean' distance = falconn.DistanceFunction.EuclideanSquared elif self.metric in ['cosine', 'NegativeInnerProduct', 'neg_inner']: self.metric = 'cosine' distance = falconn.DistanceFunction.NegativeInnerProduct else: warnings.warn(f'Invalid metric "{self.metric}". Using "euclidean" instead') self.metric = 'euclidean' distance = falconn.DistanceFunction.EuclideanSquared # Set up the LSH index lsh_construction_params = falconn.get_default_parameters(*X.shape, distance=distance) lsh_index = falconn.LSHIndex(lsh_construction_params) lsh_index.setup(X) self.X_train_ = X self.y_train_ = y self.index_ = lsh_index return self
[docs] def kneighbors(self, X: np.ndarray = None, n_candidates: int = None, return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]: """ Retrieve k nearest neighbors. Parameters ---------- X: np.array or None, optional, default = None Query objects. If None, search among the indexed objects. n_candidates: int or None, optional, default = None Number of neighbors to retrieve. If None, use the value passed during construction. return_distance: bool, default = True If return_distance, will return distances and indices to neighbors. Else, only return the indices. """ check_is_fitted(self, ["index_", 'X_train_']) # Check the n_neighbors parameter if n_candidates is None: n_candidates = self.n_candidates elif n_candidates <= 0: raise ValueError(f"Expected n_neighbors > 0. Got {n_candidates:d}") else: if not np.issubdtype(type(n_candidates), np.integer): raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value") if X is not None: X = check_array(X, dtype=self.X_train_.dtype) query_is_train = False X = check_array(X, accept_sparse='csr') n_retrieve = n_candidates else: query_is_train = True X = self.X_train_ # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_retrieve = n_candidates + 1 # Configure the LSH query object query = self.index_.construct_query_object() query.set_num_probes(self.num_probes) if return_distance: if self.metric == 'euclidean': distances = partial(euclidean_distances, squared=False) elif self.metric == 'sqeuclidean': distances = partial(euclidean_distances, squared=True) elif self.metric == 'cosine': distances = cosine_distances else: raise ValueError(f'Internal error: unrecognized metric "{self.metric}"') # Allocate memory for neighbor indices (and distances) n_objects = X.shape[0] neigh_ind = np.empty((n_objects, n_candidates), dtype=np.int32) if return_distance: neigh_dist = np.empty_like(neigh_ind, dtype=X.dtype) # If verbose, show progress bar on the search loop disable_tqdm = False if self.verbose else True for i, x in tqdm(enumerate(X), desc='LSH', disable=disable_tqdm, ): knn = np.array(query.find_k_nearest_neighbors(x, k=n_retrieve)) if query_is_train: knn = knn[1:] neigh_ind[i, :knn.size] = knn if return_distance: neigh_dist[i, :knn.size] = distances(x.reshape(1, -1), self.X_train_[knn]) # LSH may yield fewer neighbors than n_neighbors. # We set distances to NaN, and indices to -1 if knn.size < n_candidates: neigh_ind[i, knn.size:] = -1 if return_distance: neigh_dist[i, knn.size:] = np.nan if return_distance: return neigh_dist, neigh_ind else: return neigh_ind
[docs] def radius_neighbors(self, X: np.ndarray = None, radius: float = None, return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]: """ Retrieve neighbors within a certain radius. Parameters ---------- X: np.array or None, optional, default = None Query objects. If None, search among the indexed objects. radius: float or None, optional, default = None Retrieve neighbors within this radius. Can be negative: See Notes. return_distance: bool, default = True If return_distance, will return distances and indices to neighbors. Else, only return the indices. Notes ----- From the falconn docs: radius can be negative, and for the distance function 'negative_inner_product' it actually makes sense. """ check_is_fitted(self, ["index_", 'X_train_']) # Constructing a query object query = self.index_.construct_query_object() query.set_num_probes(self.num_probes) if return_distance: if self.metric == 'euclidean': distances = partial(euclidean_distances, squared=False) elif self.metric == 'sqeuclidean': distances = partial(euclidean_distances, squared=True) elif self.metric == 'cosine': distances = cosine_distances else: raise ValueError(f'Internal error: unrecognized metric "{self.metric}"') if X is not None: query_is_train = False X = check_array(X, accept_sparse='csr', dtype=self.X_train_.dtype) else: query_is_train = True X = self.X_train_ if radius is None: radius = self.radius # LSH uses squared Euclidean internally if self.metric == 'euclidean': radius *= radius # Add a small number to imitate <= threshold radius += 1e-7 # Allocate memory for neighbor indices (and distances) n_objects = X.shape[0] neigh_ind = np.empty(n_objects, dtype='object') if return_distance: neigh_dist = np.empty_like(neigh_ind) # If verbose, show progress bar on the search loop disable_tqdm = False if self.verbose else True for i, x in tqdm(enumerate(X), desc='LSH', disable=disable_tqdm, ): knn = np.array(query.find_near_neighbors(x, threshold=radius)) if len(knn) == 0: knn = np.array([], dtype=int) else: if query_is_train: knn = knn[1:] neigh_ind[i] = knn if return_distance: if len(knn): neigh_dist[i] = distances(x.reshape(1, -1), self.X_train_[knn]).ravel() else: neigh_dist[i] = np.array([]) if return_distance: return neigh_dist, neigh_ind else: return neigh_ind