# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# PEP 563: Postponed Evaluation of Annotations
from __future__ import annotations
from functools import partial
import logging
import sys
from typing import Tuple, Union
import warnings
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.metrics.pairwise import cosine_distances
from sklearn.utils.validation import check_is_fitted, check_array, check_X_y
try:
import puffinn
except ImportError:
logging.warning("The package 'puffinn' is not available.") # pragma: no cover
try:
import falconn
except ImportError:
logging.warning("The package 'falconn' is not available.") # pragma: no cover
from tqdm.auto import tqdm
from .approximate_neighbors import ApproximateNearestNeighbor
from ..utils.check import check_n_candidates
__all__ = ['FalconnLSH', 'PuffinnLSH', ]
class PuffinnLSH(BaseEstimator, ApproximateNearestNeighbor):
    """ Wrap Puffinn LSH for scikit-learn compatibility.

    Parameters
    ----------
    n_candidates: int, default = 5
        Number of neighbors to retrieve
    metric: str, default = 'euclidean'
        Distance metric, allowed are "angular", "jaccard".
        Other metrics are partially supported, such as 'euclidean', 'sqeuclidean'.
        In these cases, 'angular' distances are used to find the candidate set
        of neighbors with LSH among all indexed objects, and (squared) Euclidean
        distances are subsequently only computed for the candidates.
    memory: int, default = 1GB
        Max memory usage
    recall: float, default = 0.90
        Probability of finding the true nearest neighbors among the candidates
    n_jobs: int, default = 1
        Number of parallel jobs
    verbose: int, default = 0
        Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying.

    Attributes
    ----------
    valid_metrics:
        List of valid distance metrics/measures
    """
    valid_metrics = ["angular", "cosine", "euclidean", "sqeuclidean", "minkowski",
                     "jaccard",
                     ]
    # Metrics puffinn does not support natively are mapped to 'angular' for the
    # LSH candidate search; exact distances are computed afterwards in kneighbors.
    metric_map = {'euclidean': 'angular',
                  'sqeuclidean': 'angular',
                  'minkowski': 'angular',
                  'cosine': 'angular',
                  }

    def __init__(self, n_candidates: int = 5,
                 metric: str = 'euclidean',
                 memory: int = 1024**3,
                 recall: float = 0.9,
                 n_jobs: int = 1,
                 verbose: int = 0,
                 ):
        super().__init__(n_candidates=n_candidates,
                         metric=metric,
                         n_jobs=n_jobs,
                         verbose=verbose,
                         )
        self.memory = memory
        self.recall = recall

    def fit(self, X, y=None) -> PuffinnLSH:
        """ Build the puffinn LSH index and insert data from X.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: PuffinnLSH
            An instance of PuffinnLSH with a built index
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
            self.y_train_ = y

        # Fall back to 'euclidean' for unsupported metrics (with a warning),
        # then map partially-supported metrics to the native 'angular' measure.
        if self.metric not in self.valid_metrics:
            warnings.warn(f'Invalid metric "{self.metric}". Using "euclidean" instead')
            self.metric = 'euclidean'
        try:
            self.effective_metric = self.metric_map[self.metric]
        except KeyError:
            self.effective_metric = self.metric

        # Reduce default memory consumption for unit tests
        if "pytest" in sys.modules:
            self.memory = 3 * 1024**2

        # Construct the index
        index = puffinn.Index(self.effective_metric,
                              X.shape[1],
                              self.memory,
                              )

        disable_tqdm = not self.verbose
        for v in tqdm(X, desc='Indexing', disable=disable_tqdm):
            index.insert(v.tolist())
        index.rebuild(num_threads=self.n_jobs)

        self.index_ = index
        # TODO: remove, once we can retrieve vectors from the index itself
        self.X_train_ = X

        return self

    def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.
        """
        check_is_fitted(self, 'index_')

        if n_candidates is None:
            n_candidates = self.n_candidates
        n_candidates = check_n_candidates(n_candidates)

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        if X is None:
            X = self.X_train_
            n_neighbors = n_candidates + 1
            start = 1
        else:
            X = check_array(X)
            n_neighbors = n_candidates
            start = 0

        n_test = X.shape[0]
        dtype = X.dtype

        # If the chosen metric is not among the natively supported ones, the
        # candidates were retrieved with 'angular' LSH and must be reordered
        # by the exact distances computed below.
        reorder = self.metric not in ('angular', 'cosine', 'jaccard')

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        neigh_ind = -np.ones((n_test, n_candidates),
                             dtype=np.int32)
        if return_distance or reorder:
            # Promote to a floating dtype so the array can hold NaN,
            # while keeping float32 inputs in float32.
            neigh_dist = np.full((n_test, n_candidates), np.nan,
                                 dtype=np.promote_types(dtype, np.float32))

        # pairwise_distances has no 'angular' metric; 'cosine' yields the same ranking
        metric = 'cosine' if self.metric == 'angular' else self.metric

        index = self.index_

        disable_tqdm = not self.verbose
        for i, x in tqdm(enumerate(X),
                         total=n_test,
                         desc='Querying',
                         disable=disable_tqdm,
                         ):
            # Find the approximate nearest neighbors.
            # Each of the true `n_candidates` nearest neighbors
            # has at least `recall` chance of being found.
            ind = index.search(x.tolist(),
                               n_neighbors,
                               self.recall,
                               )
            ind = ind[start:]
            neigh_ind[i, :len(ind)] = ind
            if return_distance or reorder:
                neigh_dist[i, :len(ind)] = pairwise_distances(x.reshape(1, -1),
                                                              self.X_train_[ind],
                                                              metric=metric,
                                                              )

        if reorder:
            # NaN entries (missing candidates) sort to the end of each row
            sort = np.argsort(neigh_dist, axis=1)
            neigh_dist = np.take_along_axis(neigh_dist, sort, axis=1)
            neigh_ind = np.take_along_axis(neigh_ind, sort, axis=1)

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind
class FalconnLSH(ApproximateNearestNeighbor):
    """Wrapper for using falconn LSH

    Falconn is an approximate nearest neighbor library,
    that uses multiprobe locality-sensitive hashing.

    Parameters
    ----------
    n_candidates: int, default = 5
        Number of neighbors to retrieve
    radius: float or None, optional, default = None
        Retrieve neighbors within this radius.
        Can be negative: See Notes.
    metric: str, default = 'euclidean'
        Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot"
    num_probes: int, default = 50
        The number of buckets the query algorithm probes.
        The higher number of probes is, the better accuracy one gets,
        but the slower queries are.
    n_jobs: int, default = 1
        Number of parallel jobs
    verbose: int, default = 0
        Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying.

    Attributes
    ----------
    valid_metrics:
        List of valid distance metrics/measures

    Notes
    -----
    From the falconn docs: radius can be negative, and for the distance function
    'negative_inner_product' it actually makes sense.
    """
    valid_metrics = ['euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean',
                     'cosine', 'neg_inner', 'NegativeInnerProduct']

    def __init__(self, n_candidates: int = 5, radius: float = 1., metric: str = 'euclidean', num_probes: int = 50,
                 n_jobs: int = 1, verbose: int = 0):
        super().__init__(n_candidates=n_candidates,
                         metric=metric,
                         n_jobs=n_jobs,
                         verbose=verbose,
                         )
        self.num_probes = num_probes
        self.radius = radius

    def _distance_function(self):
        """ Return a callable computing exact distances for the fitted metric.

        Raises
        ------
        ValueError
            If ``self.metric`` was not normalized by :meth:`fit` to one of
            'euclidean', 'sqeuclidean', 'cosine'.
        """
        if self.metric == 'euclidean':
            return partial(euclidean_distances, squared=False)
        elif self.metric == 'sqeuclidean':
            return partial(euclidean_distances, squared=True)
        elif self.metric == 'cosine':
            return cosine_distances
        else:
            raise ValueError(f'Internal error: unrecognized metric "{self.metric}"')

    def fit(self, X: np.ndarray, y: np.ndarray = None) -> FalconnLSH:
        """ Setup the LSH index from training data.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: FalconnLSH
            An instance of LSH with a built index
        """
        X = check_array(X, dtype=[np.float32, np.float64])

        # Normalize metric aliases, and pick the falconn distance function.
        # Note: falconn uses squared Euclidean internally for both (sq)euclidean
        # variants; the exact distances are recomputed at query time.
        if self.metric in ['euclidean', 'l2', 'minkowski']:
            self.metric = 'euclidean'
            distance = falconn.DistanceFunction.EuclideanSquared
        elif self.metric in ['squared_euclidean', 'sqeuclidean']:
            self.metric = 'sqeuclidean'
            distance = falconn.DistanceFunction.EuclideanSquared
        elif self.metric in ['cosine', 'NegativeInnerProduct', 'neg_inner']:
            self.metric = 'cosine'
            distance = falconn.DistanceFunction.NegativeInnerProduct
        else:
            warnings.warn(f'Invalid metric "{self.metric}". Using "euclidean" instead')
            self.metric = 'euclidean'
            distance = falconn.DistanceFunction.EuclideanSquared

        # Set up the LSH index
        lsh_construction_params = falconn.get_default_parameters(*X.shape,
                                                                 distance=distance)
        lsh_index = falconn.LSHIndex(lsh_construction_params)
        lsh_index.setup(X)

        self.X_train_ = X
        self.y_train_ = y
        self.index_ = lsh_index

        return self

    def kneighbors(self, X: np.ndarray = None,
                   n_candidates: int = None,
                   return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.
        """
        check_is_fitted(self, ["index_", 'X_train_'])

        # Check the n_neighbors parameter: validate the type BEFORE the sign,
        # so that e.g. a negative float raises a clear TypeError instead of
        # crashing on the ':d' format spec in the ValueError message.
        if n_candidates is None:
            n_candidates = self.n_candidates
        elif not np.issubdtype(type(n_candidates), np.integer):
            raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value")
        elif n_candidates <= 0:
            raise ValueError(f"Expected n_neighbors > 0. Got {n_candidates:d}")

        if X is not None:
            query_is_train = False
            # Single validation pass; casts queries to the (dense) training dtype
            X = check_array(X, dtype=self.X_train_.dtype)
            n_retrieve = n_candidates
        else:
            query_is_train = True
            X = self.X_train_
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_retrieve = n_candidates + 1

        # Configure the LSH query object
        query = self.index_.construct_query_object()
        query.set_num_probes(self.num_probes)

        if return_distance:
            distances = self._distance_function()

        # Allocate memory for neighbor indices (and distances)
        n_objects = X.shape[0]
        neigh_ind = np.empty((n_objects, n_candidates), dtype=np.int32)
        if return_distance:
            neigh_dist = np.empty_like(neigh_ind, dtype=X.dtype)

        # If verbose, show progress bar on the search loop
        disable_tqdm = not self.verbose
        for i, x in tqdm(enumerate(X),
                         total=n_objects,
                         desc='LSH',
                         disable=disable_tqdm,
                         ):
            knn = np.array(query.find_k_nearest_neighbors(x, k=n_retrieve))
            if query_is_train:
                # Drop the query itself (assumed to be the first hit)
                knn = knn[1:]
            neigh_ind[i, :knn.size] = knn

            if return_distance:
                neigh_dist[i, :knn.size] = distances(x.reshape(1, -1), self.X_train_[knn])

            # LSH may yield fewer neighbors than n_neighbors.
            # We set distances to NaN, and indices to -1
            if knn.size < n_candidates:
                neigh_ind[i, knn.size:] = -1
                if return_distance:
                    neigh_dist[i, knn.size:] = np.nan

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind

    def radius_neighbors(self, X: np.ndarray = None,
                         radius: float = None,
                         return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve neighbors within a certain radius.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        radius: float or None, optional, default = None
            Retrieve neighbors within this radius.
            Can be negative: See Notes.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.

        Notes
        -----
        From the falconn docs: radius can be negative, and for the distance function
        'negative_inner_product' it actually makes sense.
        """
        check_is_fitted(self, ["index_", 'X_train_'])

        # Constructing a query object
        query = self.index_.construct_query_object()
        query.set_num_probes(self.num_probes)

        if return_distance:
            distances = self._distance_function()

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse='csr', dtype=self.X_train_.dtype)
        else:
            query_is_train = True
            X = self.X_train_

        if radius is None:
            radius = self.radius
        # LSH uses squared Euclidean internally
        if self.metric == 'euclidean':
            radius *= radius
        # Add a small number to imitate <= threshold
        radius += 1e-7

        # Allocate memory for neighbor indices (and distances).
        # Result sets are ragged, hence object arrays of per-query arrays.
        n_objects = X.shape[0]
        neigh_ind = np.empty(n_objects, dtype='object')
        if return_distance:
            neigh_dist = np.empty_like(neigh_ind)

        # If verbose, show progress bar on the search loop
        disable_tqdm = not self.verbose
        for i, x in tqdm(enumerate(X),
                         total=n_objects,
                         desc='LSH',
                         disable=disable_tqdm,
                         ):
            knn = np.array(query.find_near_neighbors(x, threshold=radius))
            if len(knn) == 0:
                knn = np.array([], dtype=int)
            else:
                if query_is_train:
                    # Drop the query itself (assumed to be the first hit)
                    knn = knn[1:]
            neigh_ind[i] = knn

            if return_distance:
                if len(knn):
                    neigh_dist[i] = distances(x.reshape(1, -1), self.X_train_[knn]).ravel()
                else:
                    neigh_dist[i] = np.array([])

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind