# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# Author: Roman Feldbauer (adaptions for scikit-hubness)
# PEP 563: Postponed Evaluation of Annotations
from __future__ import annotations
import logging
from typing import Union, Tuple
try:
import ngtpy
except (ImportError, ModuleNotFoundError) as e:
logging.warning("The package 'ngt' is required to run this example.") # pragma: no cover
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from tqdm.auto import tqdm
from .approximate_neighbors import ApproximateNearestNeighbor
from ..utils.check import check_n_candidates
from ..utils.io import create_tempfile_preferably_in_dir
# NOTE(review): no module docstring is visible above, so this presumably
# prints "None" whenever the module is imported -- confirm it is intended.
print(__doc__)
# Names exported by `from <module> import *`: only the NNG wrapper class.
__all__ = ['NNG',
]
class NNG(BaseEstimator, ApproximateNearestNeighbor):
    """Wrapper for ngtpy and NNG

    Parameters
    ----------
    n_candidates: int, default = 5
        Number of neighbors to retrieve
    metric: str, default = 'euclidean'
        Distance metric, allowed are 'manhattan', 'L1', 'euclidean', 'L2', 'minkowski',
        'sqeuclidean', 'Angle', 'Normalized Angle', 'Hamming', 'Jaccard', 'Cosine'
        or 'Normalized Cosine'.
        'manhattan' is passed to ngt as 'L1'; 'euclidean', 'minkowski' and
        'sqeuclidean' are passed as 'L2'. For 'sqeuclidean', the distances
        returned by :meth:`kneighbors` are squared afterwards.
    index_dir: str, default = 'auto'
        Store the index in the given directory.
        If None, keep the index in main memory (NON pickleable index),
        If index_dir is a string, it is interpreted as a directory to store the index into,
        if 'auto', create a temp dir for the index, preferably in /dev/shm on Linux.
        Note: The directory/the index will NOT be deleted automatically.
    edge_size_for_creation: int, default = 40
        Forwarded unchanged to ``ngtpy.create`` (see the NGT documentation).
    edge_size_for_search: int, default = 10
        Forwarded unchanged to ``ngtpy.create`` (see the NGT documentation).
    n_jobs: int, default = 1
        Number of parallel jobs
    verbose: int, default = 0
        Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying.

    Attributes
    ----------
    valid_metrics:
        List of valid distance metrics/measures

    Notes
    -----
    NNG stores the index to a directory specified in `index_dir`.
    The index is persistent, and will NOT be deleted automatically.
    It is the user's responsibility to take care of deletion,
    when required.
    """
    valid_metrics = ['manhattan', 'L1', 'euclidean', 'L2', 'minkowski', 'sqeuclidean',
                     'Angle', 'Normalized Angle', 'Cosine', 'Normalized Cosine', 'Hamming', 'Jaccard']
    # Common metric names -> distance type names understood by ngt.
    internal_distance_type = {'manhattan': 'L1',
                              'euclidean': 'L2',
                              'minkowski': 'L2',
                              'sqeuclidean': 'L2',
                              }

    def __init__(self, n_candidates: int = 5,
                 metric: str = 'euclidean',
                 index_dir: str = 'auto',
                 edge_size_for_creation: int = 40,
                 edge_size_for_search: int = 10,
                 n_jobs: int = 1,
                 verbose: int = 0):
        super().__init__(n_candidates=n_candidates,
                         metric=metric,
                         n_jobs=n_jobs,
                         verbose=verbose,
                         )
        self.index_dir = index_dir
        self.edge_size_for_creation = edge_size_for_creation
        self.edge_size_for_search = edge_size_for_search

    def fit(self, X, y=None) -> NNG:
        """ Build the ngtpy.Index and insert data from X.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Only validated together with X and stored as ``y_train_``;
            it does not influence the index.

        Returns
        -------
        self: NNG
            An instance of NNG with a built index

        Raises
        ------
        ValueError
            If the metric is not one of ``valid_metrics``.
        TypeError
            If ``index_dir`` is neither a string nor None.
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)

        self.y_train_ = y
        self.n_samples_train_ = X.shape[0]
        self.n_features_ = X.shape[1]
        self.X_dtype_ = X.dtype

        # Map common distance names to names used by ngt;
        # names without a mapping are passed through unchanged.
        self.effective_metric_ = NNG.internal_distance_type.get(self.metric, self.metric)
        if self.effective_metric_ not in NNG.valid_metrics:
            raise ValueError(f'Unknown distance/similarity measure: {self.effective_metric_}. '
                             f'Please use one of: {NNG.valid_metrics}.')

        # Set up a directory to save the index to
        if isinstance(self.index_dir, str) and self.index_dir == 'auto':
            index_path = create_tempfile_preferably_in_dir(prefix='skhubness_',
                                                           suffix='.onng',
                                                           directory='/dev/shm')
            logging.warning(f'The index will be stored in {index_path}. '
                            f'It will NOT be deleted automatically, when this instance is destructed.')
        elif isinstance(self.index_dir, str):
            index_path = create_tempfile_preferably_in_dir(prefix='skhubness_',
                                                           suffix='.onng',
                                                           directory=self.index_dir)
        elif self.index_dir is None:
            # ngtpy.create always needs a path, even if the index object
            # is kept in memory afterwards.
            index_path = create_tempfile_preferably_in_dir(prefix='skhubness_',
                                                           suffix='.onng')
        else:
            raise TypeError('NNG requires to write an index to the filesystem. '
                            'Please provide a valid path with parameter `index_dir`.')

        # Create the NNG index, insert data
        # TODO add ngt optimizer
        ngtpy.create(path=index_path,
                     dimension=self.n_features_,
                     edge_size_for_creation=self.edge_size_for_creation,
                     edge_size_for_search=self.edge_size_for_search,
                     distance_type=self.effective_metric_,
                     )
        index_obj = ngtpy.Index(index_path)
        index_obj.batch_insert(X, num_threads=self.n_jobs)

        # Keep index in memory or store in path
        if self.index_dir is None:
            self.index_ = index_obj
        else:
            index_obj.save()
            self.index_ = index_path

        return self

    def kneighbors(self, X=None, n_candidates=None, return_distance=True
                   ) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]:
        """ Retrieve k nearest neighbors.

        Parameters
        ----------
        X: np.array or None, optional, default = None
            Query objects. If None, search among the indexed objects.
        n_candidates: int or None, optional, default = None
            Number of neighbors to retrieve.
            If None, use the value passed during construction.
        return_distance: bool, default = True
            If return_distance, will return distances and indices to neighbors.
            Else, only return the indices.

        Returns
        -------
        neigh_dist: np.ndarray, shape (n_query, n_candidates)
            Distances to the neighbors; only returned if ``return_distance``.
            Positions without a neighbor are NaN.
        neigh_ind: np.ndarray of int32, shape (n_query, n_candidates)
            Indices of the neighbors. Positions without a neighbor are -1.
        """
        check_is_fitted(self, 'index_')
        if X is not None:
            X = check_array(X)

        n_test = self.n_samples_train_ if X is None else X.shape[0]
        dtype = self.X_dtype_ if X is None else X.dtype

        if n_candidates is None:
            n_candidates = self.n_candidates
        n_candidates = check_n_candidates(n_candidates)

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        if X is None:
            n_neighbors = n_candidates + 1
            start = 1
        else:
            n_neighbors = n_candidates
            start = 0

        # If fewer candidates than required are found for a query,
        # we save index=-1 and distance=NaN
        neigh_ind = -np.ones((n_test, n_candidates),
                             dtype=np.int32)
        if return_distance:
            # Multiplying by NaN (instead of filling) lets numpy promote an
            # integer input dtype to a float dtype that can hold NaN.
            neigh_dist = np.empty_like(neigh_ind,
                                       dtype=dtype) * np.nan

        # The index is either a path to load from, or the in-memory object
        if isinstance(self.index_, str):
            index = ngtpy.Index(self.index_)
        else:
            index = self.index_

        # Without X, the indexed objects themselves serve as queries;
        # they are fetched lazily, one at a time.
        if X is None:
            queries = (index.get_object(i) for i in range(n_test))
        else:
            queries = X

        for i, query in tqdm(enumerate(queries),
                             total=n_test,
                             desc='Query NNG',
                             disable=not self.verbose,
                             ):
            response = index.search(query=query,
                                    size=n_neighbors,
                                    with_distance=return_distance,
                                    )
            # No hit at all: keep the -1/NaN placeholders for this row.
            # (Unpacking an empty response below would raise ValueError.)
            if len(response) == 0:
                continue

            if return_distance:
                # response is a sequence of (index, distance) pairs
                ind, dist = (np.array(column) for column in zip(*response))
                dist = dist[start:]
                neigh_dist[i, :len(dist)] = dist
            else:
                ind = response
            ind = ind[start:]
            neigh_ind[i, :len(ind)] = ind

        if not return_distance:
            return neigh_ind

        # ngt was queried with 'L2'; square to obtain squared Euclidean distances
        if self.metric == 'sqeuclidean':
            neigh_dist **= 2
        return neigh_dist, neigh_ind