# Source code for Mmani.geometry.distance

# Authors: Marina Meila <mmp@stat.washington.edu>
#         James McQueen <jmcq@u.washington.edu>
# License: BSD 3 clause
from __future__ import division ## removes integer division
import numpy as np
from scipy import sparse
from scipy.spatial.distance import pdist
from Mmani.geometry.cyflann.index import Index
import subprocess, os, sys, warnings

def _row_col_from_condensed_index(N,compr_ind):
    # convert from pdist compressed index format to (I, J) (upper triangular) pairs.
    b = 1 -2*N 
    i = np.floor((-b - np.sqrt(b**2 - 8*compr_ind))/2)
    j = compr_ind + i*(b + i + 2)/2 + 1
    return (i,j)  

def distance_matrix(X, method = 'auto', flindex = None, radius = None, cyindex = None):
    """
    Computes pairwise distance matrix. Interface function.

    Parameters
    ----------
    X : data matrix, array_like, shape = (n_samples, n_dimensions)
    method : one of 'auto', 'brute', 'pyflann', 'cyflann', or 'cython'.
        'auto' currently selects 'brute'.
        'pyflann' requires python library pyflann
        'cython' requires FLANN and cython
        'cyflann' requires a UNIX system
    flindex : a pyflann pre-computed flindex (mandatory for 'pyflann')
    radius : neighborhood radius, scalar
        the neighbors lying approximately within radius of a node will
        be returned. Or, in other words, all distances will be less or
        equal to radius. There will be entries in the matrix for zero
        distances. Defaults to 1 / n_dimensions when None.

        Attention when converting to dense: The rest of the distances
        should not be considered 0, but "large".
    cyindex : A cython computed FLANN index. Built from X when None and
        method is 'cython'.

    Returns
    -------
    graph : the distance matrix, shape (X.shape[0], X.shape[0]),
        in whatever sparse format the selected backend produces.
        Zeros on the diagonal are true zeros.
        Zeros not on the diagonal should be considered infinite

    Raises
    ------
    ValueError
        If method is 'pyflann' and no flindex is given, or if method is
        not one of the supported names.
    """
    if radius is None:
        radius = 1 / X.shape[1]
    if method == 'auto':
        # change this based on benchmarking results
        method = 'brute'

    if method == 'pyflann':
        if flindex is None:
            raise ValueError('must pass a flindex when using pyflann')
        return fl_radius_neighbors_graph(X, radius, flindex)
    if method == 'cyflann':
        return fl_cpp_radius_neighbors_graph(X, radius)
    if method == 'cython':
        if cyindex is None:
            # No pre-built index supplied: build one over X.
            cyindex = Index(X)
            cyindex.buildIndex()
        return cyindex.radius_neighbors_graph(X, radius)
    if method == 'brute':
        return radius_neighbors_graph(X, radius)
    raise ValueError('method must be one of: '
                     '(auto, brute, pyflann, cyflann, cython)')
def radius_neighbors_graph(X, radius):
    """
    Computes pairwise distance matrix using dense method.

    All pairwise distances are computed with ``pdist``; only those less
    than or equal to ``radius`` are stored.

    Parameters
    ----------
    X: data matrix, array_like, shape = (n_samples, n_dimensions)
    radius: neighborhood radius, scalar

    Returns
    -------
    graph: the distance matrix, shape (X.shape[0], X.shape[0])
        sparse csr_format. Zeros on the diagonal are true zeros.
        Stored zeros off the diagonal are true zero distances
        (duplicate points); entries that are not stored should be
        considered infinite.
    """
    n_samples = X.shape[0]
    all_dists = pdist(X)
    within = np.where(all_dists <= radius)[0]
    rows, cols = _row_col_from_condensed_index(n_samples, within)
    rows = np.asarray(rows, dtype=np.intp)
    cols = np.asarray(cols, dtype=np.intp)
    dists = all_dists[within]

    # Assemble both triangles and the diagonal explicitly.  Symmetrizing
    # with ``graph + graph.T`` goes through sparse addition, which drops
    # results equal to zero, so zero distances between duplicate points
    # would vanish and later be read as "infinite".
    diag = np.arange(n_samples, dtype=np.intp)
    data = np.concatenate((dists, dists, np.zeros(n_samples)))
    row_ind = np.concatenate((rows, cols, diag))
    col_ind = np.concatenate((cols, rows, diag))
    graph = sparse.coo_matrix((data, (row_ind, col_ind)),
                              shape=(n_samples, n_samples))
    return graph.tocsr()
def fl_cpp_radius_neighbors_graph(X, radius):
    """
    Constructs a sparse distance matrix called graph in csr format
    using pre-compiled C++ function.

    The data is written to ``test_data_file.dat`` in the current working
    directory, the ``compute_flann_neighbors_cpp`` executable located next
    to this module is run on it, and its output files ``neighbors.dat``
    and ``distances.dat`` are read back from the current working directory.

    Parameters
    ----------
    X: data matrix, array_like, shape = (n_samples, n_dimensions )
    radius: neighborhood radius, scalar
        the neighbors lying approximately within radius of a node will
        be returned. Or, in other words, all distances will be less or
        equal to radius. There will be entries in the matrix for zero
        distances.

        Attention when converting to dense: The rest of the distances
        should not be considered 0, but "large".

    Returns
    -------
    graph: the distance matrix, shape = (X.shape[0], X.shape[0])
        sparse csr format

    Raises
    ------
    RuntimeError
        If the executable cannot be started or exits with a non-zero code.

    Notes
    -----
    With approximate neighborhood search, the matrix is not necessarily
    symmetric.
    """
    sq_radius = radius * radius  # FLANN computes squared distance.

    # To run the C++ executable we must save the data to a binary file
    nsam, ndim = X.shape
    fname = os.path.join(os.getcwd(), "test_data_file.dat")
    X.tofile(fname)

    # The executable lives in the same directory as this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    cpp_file_path = os.path.join(module_dir, "compute_flann_neighbors_cpp")

    # Pass an argument list without a shell so that paths containing
    # spaces or shell metacharacters reach the executable unchanged.
    dist_call = [cpp_file_path, str(nsam), str(ndim), str(sq_radius), fname]
    try:
        ret_code = subprocess.call(dist_call)
    except OSError as err:
        # Keep the exception type callers already see on failure.
        raise RuntimeError("shell call: " + " ".join(dist_call)
                           + " could not be started: " + str(err))
    if ret_code != 0:
        raise RuntimeError("shell call: " + " ".join(dist_call)
                           + " failed with code: " + str(ret_code))

    # the resulting files from the C++ function are:
    # neighbors.dat: contains nsam rows with space separated index of nbr
    # distances.dat: contains nsam rows with space separated distance of nbr
    with open("neighbors.dat", "rb") as handle:
        allnbrs = [line.split() for line in handle]
    with open("distances.dat", "rb") as handle:
        alldists = [line.split() for line in handle]

    # for sparse storage; explicit dtypes keep the index arrays integral
    # even when no neighbors were found at all
    indices = np.array([int(nbr) for row in allnbrs for nbr in row],
                       dtype=int)
    data = np.array([float(dist) for row in alldists for dist in row],
                    dtype=float)
    lengths = [len(row) for row in allnbrs]
    indpts = np.concatenate(([0], np.cumsum(lengths, dtype=int)))

    # FLANN returns squared distance
    graph = sparse.csr_matrix((np.sqrt(data), indices, indpts),
                              shape=(nsam, nsam))
    return graph
def fl_radius_neighbors_graph(X, radius, flindex):
    """
    Constructs a sparse distance matrix called graph in coo format
    using pyflann.

    Parameters
    ----------
    X: data matrix, array_like, shape = (n_samples, n_dimensions )
    radius: neighborhood radius, scalar
        the neighbors lying approximately within radius of a node will
        be returned. Or, in other words, all distances will be less or
        equal to radius. There will be entries in the matrix for zero
        distances.

        Attention when converting to dense: The rest of the distances
        should not be considered 0, but "large".
    flindex: FLANN index of the data X; must provide
        ``nn_radius(point, squared_radius)`` returning
        ``(neighbor_indices, squared_distances)``.

    Returns
    -------
    graph: the distance matrix, shape = (X.shape[0], X.shape[0])
        sparse coo format

    Raises
    ------
    ValueError
        If radius is negative.

    Notes
    -----
    With approximate neighborhood search, the matrix is not
    necessarily symmetric.

    mode = 'adjacency' not implemented
    """
    if radius < 0.:
        raise ValueError('neighbors_radius must be >=0.')
    nsam, ndim = X.shape
    X = np.require(X, requirements = ['A', 'C']) # required for FLANN

    # FLANN works with squared distances.  Use a new name instead of
    # ``radius *= radius`` so a caller-owned array is never modified.
    sq_radius = radius * radius

    graph_jindices = []
    graph_iindices = []
    graph_data = []
    # ``range`` instead of ``xrange`` so this also runs on Python 3.
    for i in range(nsam):
        jj, dd = flindex.nn_radius(X[i, :], sq_radius)
        graph_data.append(dd)
        graph_jindices.append(jj)
        graph_iindices.append(i * np.ones(jj.shape, dtype=int))

    graph_data = np.concatenate(graph_data)
    graph_iindices = np.concatenate(graph_iindices)
    graph_jindices = np.concatenate(graph_jindices)
    graph = sparse.coo_matrix((graph_data, (graph_iindices, graph_jindices)),
                              shape=(nsam, nsam))
    graph.data = np.sqrt(graph.data) # FLANN returns squared distance
    return graph