# Authors: Marina Meila <mmp@stat.washington.edu>
# James McQueen <jmcq@u.washington.edu>
# License: BSD 3 clause
from __future__ import division ## removes integer division
import numpy as np
from scipy import sparse
from scipy.spatial.distance import pdist
import subprocess, os, sys, warnings
from .cyflann.index import Index
def compute_adjacency_matrix(X, method='auto', **kwargs):
graph = distance_matrix(X, method = method, **kwargs)
return graph
def _row_col_from_condensed_index(N,compr_ind):
# convert from pdist compressed index format to (I, J) (upper triangular) pairs.
b = 1 -2*N
i = np.floor((-b - np.sqrt(b**2 - 8*compr_ind))/2)
j = compr_ind + i*(b + i + 2)/2 + 1
return (i,j)
[docs]def distance_matrix(X, method='auto', flindex=None, radius=None, cyindex=None):
"""
Computes pairwise distance matrix. Interface function.
Parameters
----------
X : data matrix, array_like, shape = (n_samples, n_dimensions)
method : {'auto', 'brute', 'pyflann', 'cyflann'}
'pyflann' requires python library pyflann
'cyflann' requires FLANN and cython
flindex : a pyflann pre-computed flindex
radius : neighborhood radius, scalar
the neighbors lying approximately within radius of a node will
be returned. Or, in other words, all distances will be less or equal
to radius. There will be entries in the matrix for zero distances.
Attention when converting to dense: The rest of the distances
should not be considered 0, but "large".
cyindex : A cython computed FLANN index.
Returns
-------
graph : the distance matrix, array_like, shape (X.shape[0]. X.shape[0])
sparse csr_format. Zeros on the diagonal are true zeros.
Zeros not on the diagonal should be considered infinite
"""
if radius is None:
# TODO: is this a suitable default?
radius = 1 / X.shape[1]
if method == 'auto':
# TODO: change this based on benchmarking results
method = 'brute'
if method == 'pyflann':
try:
import pyflann as pyf
except ImportError:
raise ValueError("pyflann not installed.")
if flindex is None:
flindex = pyf.FLANN()
flindex.build_index(X, algorithm='kmeans',target_precision=0.9)
graph = fl_radius_neighbors_graph(X, radius, flindex)
elif method == 'cyflann':
if cyindex is None:
cyindex = Index(X)
cyindex.buildIndex()
graph = cyindex.radius_neighbors_graph(X, radius)
elif method == 'brute':
graph = radius_neighbors_graph(X, radius)
else:
raise ValueError('method must be one of: {auto, brute, pyflann, cyflann}')
return graph
[docs]def radius_neighbors_graph(X, radius):
"""
Computes pairwise distance matrix using dense method.
Parameters
----------
X: data matrix, array_like, shape = (n_samples, n_dimensions)
radius: neighborhood radius, scalar
Returns
-------
graph: the distance matrix, array_like, shape (X.shape[0]. X.shape[0])
sparse csr_format. Zeros on the diagonal are true zeros.
Zeros not on the diagonal should be considered infinite
"""
N = X.shape[0]
all_dists = pdist(X)
compr_ind = np.where(all_dists <= radius)[0]
(I, J) = _row_col_from_condensed_index(X.shape[0],compr_ind)
graph = sparse.coo_matrix((all_dists[compr_ind], (I, J)), shape = (N, N))
graph = graph + graph.T # symmetrize distance matrix (converts to csr)
with warnings.catch_warnings():
warnings.simplefilter('ignore')
# sparse will complain this is faster with lil_matrix:
graph.setdiag(0.0) # diagonal values are true zeros, otherwise they're infinite
return(graph)
[docs]def fl_radius_neighbors_graph(X, radius, flindex):
"""
Constructs a sparse distance matrix in coo format using pyflann.
Parameters
----------
X: array_like, shape=(n_samples, n_dimensions)
The matrix of data
radius: float
Neighborhood radius. The neighbors lying approximately within this
radius of a node will be used to construct the graph.
flindex: pyflann.FLANN object
The pre-built FLANN index for the data X
Returns
-------
graph: sparse matrix, shape=(n_samples, n_samples)
The sparse distance matrix
Notes
-----
With approximate neighborhood search, the matrix is not
necessarily symmetric.
"""
if radius < 0.:
raise ValueError('neighbors_radius must be >=0.')
nsam, ndim = X.shape
X = np.require(X, requirements = ['A', 'C']) # required for FLANN
radius *= radius
graph_jindices = []
graph_iindices = []
graph_data = []
for i in xrange(nsam):
jj, dd = flindex.nn_radius(X[i,:], radius)
graph_data.append(dd)
graph_jindices.append(jj)
graph_iindices.append(i*np.ones(jj.shape, dtype=int))
graph_data = np.concatenate( graph_data )
graph_iindices = np.concatenate( graph_iindices )
graph_jindices = np.concatenate( graph_jindices )
graph = sparse.coo_matrix((graph_data, (graph_iindices, graph_jindices)), shape=(nsam, nsam))
graph.data = np.sqrt(graph.data) # FLANN returns squared distance
return graph