# Source code for skhubness.neighbors.hnsw
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# PEP 563: Postponed Evaluation of Annotations
from __future__ import annotations
from typing import Tuple, Union
import numpy as np
from sklearn.utils.validation import check_is_fitted, check_array
import nmslib
from .approximate_neighbors import ApproximateNearestNeighbor
from ..utils.check import check_n_candidates
# Public API of this module: only the HNSW wrapper class is exported.
__all__ = ['HNSW']
class HNSW(ApproximateNearestNeighbor):
    """Wrapper for using nmslib

    Hierarchical navigable small-world graphs are data structures,
    that allow for approximate nearest neighbor search.
    Here, an implementation from nmslib is used.

    Parameters
    ----------
    n_candidates: int, default = 5
        Number of neighbors to retrieve
    metric: str, default = 'euclidean'
        Distance metric. Allowed are "euclidean" (aliases "l2", "minkowski"),
        "sqeuclidean" (alias "squared_euclidean"),
        and "cosine" (alias "cosinesimil"). See ``valid_metrics``.
    method: str, default = 'hnsw',
        ANN method to use. Currently, only 'hnsw' is supported.
        The value is handed to ``nmslib.init`` without further checks.
    post_processing: int, default = 2
        More post processing means longer index creation,
        and higher retrieval accuracy.
    n_jobs: int, default = 1
        Number of parallel jobs (passed to nmslib as ``num_threads`` when querying)
    verbose: int, default = 0
        Verbosity level. In this class, ``verbose >= 2`` enables
        nmslib's progress output while the index is created.

    Attributes
    ----------
    valid_metrics:
        List of valid distance metrics/measures
    """
    valid_metrics = ['euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean',
                     'cosine', 'cosinesimil']

    def __init__(self, n_candidates: int = 5, metric: str = 'euclidean',
                 method: str = 'hnsw', post_processing: int = 2,
                 n_jobs: int = 1, verbose: int = 0):
        super().__init__(n_candidates=n_candidates,
                         metric=metric,
                         n_jobs=n_jobs,
                         verbose=verbose)
        self.method = method
        self.post_processing = post_processing
        # Name of the nmslib space; determined from ``metric`` in fit().
        self.space = None

    def fit(self, X, y=None) -> HNSW:
        """ Setup the HNSW index from training data.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: HNSW
            An instance of HNSW with a built graph

        Raises
        ------
        ValueError
            If ``self.metric`` is not one of ``valid_metrics``.
        """
        X = check_array(X)

        # Translate the user-facing metric name into an nmslib space.
        # Note that ``self.metric`` is normalized in place: kneighbors() checks
        # for the canonical name 'sqeuclidean' to decide whether to square distances.
        if self.metric in ('euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean'):
            if self.metric in ('squared_euclidean', 'sqeuclidean'):
                self.metric = 'sqeuclidean'
            else:
                self.metric = 'euclidean'
            self.space = 'l2'
        elif self.metric in ('cosine', 'cosinesimil'):
            self.space = 'cosinesimil'
        else:
            raise ValueError(f'Invalid metric "{self.metric}". Please try "euclidean" or "cosine".')

        hnsw_index = nmslib.init(method=self.method,
                                 space=self.space)
        hnsw_index.addDataPointBatch(X)
        hnsw_index.createIndex({'post': self.post_processing},
                               print_progress=(self.verbose >= 2))
        self.index_ = hnsw_index

        return self

    def kneighbors(self, X: np.ndarray = None,
                   n_candidates: int = None,
                   return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. Querying among the indexed objects (X = None)
            is not implemented; a NotImplementedError is raised in that case.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.

        Returns
        -------
        neigh_dist: np.array, shape (n_query, n_candidates)
            Distances to the neighbors; only returned if ``return_distance``.
            Positions without a retrieved neighbor hold NaN.
        neigh_ind: np.array of int32, shape (n_query, n_candidates)
            Indices of the neighbors.
            Positions without a retrieved neighbor hold -1.

        Raises
        ------
        NotImplementedError
            If ``X`` is None.
        """
        check_is_fitted(self, ["index_", ])

        if X is None:
            raise NotImplementedError('Please provide X to hnsw.kneighbors().')
        # Validate queries the same way fit() validates the indexed data, so that
        # array-likes (e.g. nested lists) work and ``X.shape``/``X.dtype`` are available.
        X = check_array(X)

        # Check the n_neighbors parameter
        if n_candidates is None:
            n_candidates = self.n_candidates
        n_candidates = check_n_candidates(n_candidates)

        # Fetch the neighbor candidates
        neigh_ind_dist = self.index_.knnQueryBatch(X,
                                                   k=n_candidates,
                                                   num_threads=self.n_jobs)

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        n_test = X.shape[0]
        neigh_ind = np.full((n_test, n_candidates), -1, dtype=np.int32)
        # NaN needs a floating dtype: keep the query precision if it is
        # floating point already, otherwise fall back to float64.
        dist_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        neigh_dist = np.full((n_test, n_candidates), np.nan, dtype=dist_dtype)

        for i, (ind, dist) in enumerate(neigh_ind_dist):
            neigh_ind[i, :ind.size] = ind
            neigh_dist[i, :dist.size] = dist

        if self.space == 'cosinesimil':
            # Map the returned values v to 1 - v.
            # NOTE(review): the nmslib manual appears to define 'cosinesimil'
            # as already returning 1 - cos(x, y); confirm that this extra
            # conversion is intended.
            neigh_dist *= -1
            neigh_dist += 1
        elif self.space == 'l2' and self.metric == 'sqeuclidean':
            # The 'l2' space is used for both Euclidean variants;
            # square the values when squared Euclidean was requested.
            neigh_dist **= 2

        if return_distance:
            return neigh_dist, neigh_ind
        else:
            return neigh_ind