Module emblaze.neighbors

Defines model classes to compute and store nearest neighbor sets that can be inherited across different Embedding objects.

Expand source code
"""
Defines model classes to compute and store nearest neighbor sets that can be
inherited across different `Embedding` objects.
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors
from .utils import *

class Neighbors:
    """
    An object representing a serializable set of nearest neighbors within an
    embedding. The `Neighbors` object simply stores a matrix of integer IDs, where rows
    correspond to points in the embedding and columns are IDs of neighbors in
    order of proximity to each point. These neighbors can be accessed through the
    `values` property.
    """
    def __init__(self, values, ids=None, metric='euclidean', n_neighbors=100, clf=None):
        """
        This constructor should typically not be used - use [`Neighbors.compute`](#emblaze.neighbors.Neighbors.compute) instead.
        
        Args:
            values: Matrix of n x n_neighbors neighbor IDs, where row i lists
                the IDs of the neighbors of point i in order of proximity
            ids: If supplied, a list of IDs for the points (rows) in the
                matrix. Defaults to `0, 1, ..., n - 1`.
            metric: Distance metric associated with the neighbors (can be any
                supported metric for `sklearn.neighbors.NearestNeighbors`)
            n_neighbors: Number of neighbors saved per point
            clf: The fitted `NearestNeighbors` object, if available. It is set
                by `Neighbors.compute`, and is None when the object is loaded
                from JSON or produced by `concat`.
        """
        super().__init__()
        self.values = values
        # Fall back to positional IDs so that the default ids=None is usable.
        self.ids = ids if ids is not None else np.arange(len(values))
        # Maps each ID to the row of `values` that holds its neighbors.
        self._id_index = {id_val: i for i, id_val in enumerate(self.ids)}
        self.metric = metric
        self.n_neighbors = n_neighbors
        self.clf = clf
    
    @classmethod
    def compute(cls, pos, ids=None, metric='euclidean', n_neighbors=100):
        """
        Compute a nearest-neighbor set using a given metric.
        
        Args:
            pos: Matrix of n x D high-dimensional positions
            ids: If supplied, a list of IDs for the points in the matrix
            metric: Distance metric to use to compute neighbors (can be any supported
                metric for `sklearn.neighbors.NearestNeighbors`)
            n_neighbors: Number of neighbors to compute and save
            
        Returns:
            An initialized `Neighbors` object containing computed neighbors.
        """
        # Convert to an array so that a plain list of IDs supports the fancy
        # indexing used below.
        ids = np.asarray(ids) if ids is not None else np.arange(len(pos))
        # One extra neighbor is requested because the first column is dropped
        # below (it is expected to be the query point itself).
        neighbor_clf = NearestNeighbors(metric=metric,
                                        n_neighbors=n_neighbors + 1).fit(pos)
        _, neigh_indexes = neighbor_clf.kneighbors(pos)
        
        return cls(ids[neigh_indexes[:,1:]], ids=ids, metric=metric, n_neighbors=n_neighbors, clf=neighbor_clf)
        
    def index(self, id_vals):
        """
        Returns the index(es) of the given IDs.

        Args:
            id_vals: A single ID, or a list, tuple, numpy array or set of IDs

        Returns:
            A list of row indexes if a collection was given, otherwise a
            single row index.
        """
        if isinstance(id_vals, (list, tuple, np.ndarray, set)):
            return [self._id_index[int(id_val)] for id_val in id_vals]
        else:
            return self._id_index[int(id_vals)]

    def __getitem__(self, ids):
        """ids can be a single ID or a sequence of IDs; None returns the full matrix"""
        if ids is None: return self.values
        return self.values[self.index(ids)]
    
    def __eq__(self, other):
        """
        Two Neighbors objects are equal when both their IDs and their neighbor
        matrices match exactly.
        """
        # Returning NotImplemented lets Python try the reflected comparison,
        # so `neighbors == neighbor_set` is still answered by the other
        # operand's __eq__, and unrelated types compare unequal.
        if not isinstance(other, Neighbors): return NotImplemented
        # IDs are integers, so compare exactly: a tolerance-based comparison
        # treats large, distinct IDs as equal and raises on mismatched shapes.
        return np.array_equal(self.ids, other.ids) and np.array_equal(self.values, other.values)
    
    def __ne__(self, other):
        return not (self == other)
        
    def __len__(self):
        """Number of points (rows of the neighbor matrix)."""
        return len(self.values)
    
    def calculate_neighbors(self, pos, return_distance=True, n_neighbors=None):
        """
        Queries the stored neighbor classifier for the neighbors of new
        positions.

        Args:
            pos: Positions to query, passed directly to `self.clf.kneighbors`
            return_distance: If True, return a (distances, indexes) tuple;
                otherwise return only the indexes
            n_neighbors: Number of neighbors to retrieve. If None (or 0),
                `self.n_neighbors` is used.

        Returns:
            The index array returned by `self.clf.kneighbors` (not mapped
            through `ids`), preceded by the distances if `return_distance`
            is True.

        Raises:
            ValueError: If this object has no neighbor classifier.
        """
        if self.clf is None:
            raise ValueError(
                ("Cannot compute neighbors because the Neighbors was not "
                 "initialized with a neighbor classifier - was it deserialized "
                 "from JSON without saving the original coordinates or "
                 "concatenated to another Neighbors?"))
        neigh_dists, neigh_indexes = self.clf.kneighbors(pos, n_neighbors=n_neighbors or self.n_neighbors)
        if return_distance:
            return neigh_dists, neigh_indexes
        return neigh_indexes
    
    def concat(self, other):
        """Concatenates the two Neighbors together, discarding the original 
        classifier.

        Args:
            other: Another `Neighbors` object with the same metric and no IDs
                in common with this one

        Returns:
            A new `Neighbors` object whose rows are this object's rows
            followed by the rows of `other`.
        """
        # np.asarray allows ids stored as plain lists (e.g. after from_json).
        own_ids = set(np.asarray(self.ids).tolist())
        other_ids = set(np.asarray(other.ids).tolist())
        assert not (own_ids & other_ids), "Cannot concatenate Neighbors objects with overlapping ID values"
        assert self.metric == other.metric, "Cannot concatenate Neighbors objects with different metrics"
        # np.concatenate takes the arrays as a single sequence argument; its
        # second positional argument is the axis.
        return Neighbors(
            np.concatenate([self.values, other.values]),
            ids=np.concatenate([self.ids, other.ids]),
            metric=self.metric,
            n_neighbors=max(self.n_neighbors, other.n_neighbors)
        )
    
    def to_json(self, compressed=True, num_neighbors=None):
        """
        Serializes the neighbors to a JSON object.

        Args:
            compressed: If True, encode the IDs and neighbors with
                `encode_numerical_array`; otherwise write a dictionary mapping
                each ID to its list of neighbor IDs
            num_neighbors: If not None, the maximum number of neighbors to
                write per point

        Returns:
            A dictionary describing this object.
        """
        result = {}
        result["metric"] = self.metric
        result["n_neighbors"] = self.n_neighbors
        
        neighbors = self.values
        if num_neighbors is not None:
            neighbors = neighbors[:,:min(num_neighbors, neighbors.shape[1])]
            
        if compressed:
            result["_format"] = "compressed"
            # Specify the type name that will be used to encode the point IDs.
            # This is important because the neighbor array takes up the bulk
            # of the space when transferring to file/widget.
            dtype, type_name = choose_integer_type(self.ids)
            result["_idtype"] = type_name
            result["_length"] = len(self)
            result["ids"] = encode_numerical_array(self.ids, dtype)
            
            result["neighbors"] = encode_numerical_array(neighbors.flatten(),
                                                            astype=dtype,
                                                            interval=neighbors.shape[1])
        else:
            result["_format"] = "expanded"
            result["neighbors"] = {}
            for id_val, row in zip(self.ids, neighbors):
                # numpy scalars are rejected as dictionary keys by the json
                # module, so convert them to native Python values.
                key = id_val.item() if isinstance(id_val, np.generic) else id_val
                result["neighbors"][key] = row.tolist()
        return result
    
    @classmethod
    def from_json(cls, data):
        """
        Builds a `Neighbors` object from the output of `to_json`. The result
        has no neighbor classifier.

        Args:
            data: A dictionary in either the "compressed" or the "expanded"
                format (the latter is assumed if "_format" is missing)
        """
        if data.get("_format", "expanded") == "compressed":
            dtype = np.dtype(data["_idtype"])
            ids = decode_numerical_array(data["ids"], dtype)
            # NOTE(review): assumes decode_numerical_array restores the 2D
            # neighbor matrix from the stored interval - confirm in utils.
            neighbors = decode_numerical_array(data["neighbors"], dtype)
        else:
            neighbor_dict = data["neighbors"]
            try:
                # JSON object keys are strings, so restore integer IDs.
                ids = [int(id_val) for id_val in neighbor_dict]
                neighbor_dict = {int(k): v for k, v in neighbor_dict.items()}
            except (ValueError, TypeError):
                # Keys that are not integers are kept as they are.
                ids = list(neighbor_dict.keys())
            ids = sorted(ids)
            neighbors = np.array([neighbor_dict[id_val] for id_val in ids])
                
        return cls(neighbors, ids=ids, metric=data["metric"], n_neighbors=data["n_neighbors"])

class NeighborSet:
    """
    An object representing a serializable collection of Neighbors objects.
    """
    def __init__(self, neighbor_objects):
        """
        Args:
            neighbor_objects: The sequence of `Neighbors` objects to wrap. It
                is stored as given (not copied).
        """
        super().__init__()
        self._neighbors = neighbor_objects
        
    def __getitem__(self, slice):
        return self._neighbors[slice]
    
    def __setitem__(self, slice, val):
        self._neighbors[slice] = val
        
    def __len__(self):
        return len(self._neighbors)
    
    def __iter__(self):
        return iter(self._neighbors)
    
    def __eq__(self, other):
        """
        A NeighborSet equals another NeighborSet when they have the same
        length and equal elements in order. It equals a single Neighbors
        object when every element equals that object.
        """
        if isinstance(other, NeighborSet):
            return len(other) == len(self) and all(n1 == n2 for n1, n2 in zip(self, other))
        elif isinstance(other, Neighbors):
            return all(n1 == other for n1 in self)
        return False
    
    def __ne__(self, other):
        return not (self == other)
    
    def to_json(self, compressed=True, num_neighbors=None):
        """
        Serializes the list of Neighbors objects to JSON.

        Args:
            compressed: Passed to `to_json` of each contained object
            num_neighbors: Passed to `to_json` of each contained object

        Returns:
            A list with one serialized entry per contained object.
        """
        return [n.to_json(compressed=compressed, num_neighbors=num_neighbors)
                for n in self]
        
    @classmethod
    def from_json(cls, data):
        """
        Builds a NeighborSet from the output of `to_json`.

        Args:
            data: A list of serialized `Neighbors` objects

        Returns:
            A NeighborSet (which supports indexing, iteration and `len` like
            the underlying list) wrapping the deserialized objects.
        """
        # Wrap the list in cls so that the result offers the NeighborSet
        # methods (identical, to_json) rather than being a bare list.
        return cls([Neighbors.from_json(d) for d in data])
    
    def identical(self):
        """Returns True if all Neighbors objects within this NeighborSet are equal to each other."""
        if len(self) == 0: return True
        return all(n == self[0] for n in self)

Classes

class NeighborSet (neighbor_objects)

An object representing a serializable collection of Neighbors objects.

Expand source code
class NeighborSet:
    """
    An object representing a serializable collection of Neighbors objects.
    """
    def __init__(self, neighbor_objects):
        """
        Args:
            neighbor_objects: The sequence of `Neighbors` objects to wrap. It
                is stored as given (not copied).
        """
        super().__init__()
        self._neighbors = neighbor_objects
        
    def __getitem__(self, slice):
        return self._neighbors[slice]
    
    def __setitem__(self, slice, val):
        self._neighbors[slice] = val
        
    def __len__(self):
        return len(self._neighbors)
    
    def __iter__(self):
        return iter(self._neighbors)
    
    def __eq__(self, other):
        """
        A NeighborSet equals another NeighborSet when they have the same
        length and equal elements in order. It equals a single Neighbors
        object when every element equals that object.
        """
        if isinstance(other, NeighborSet):
            return len(other) == len(self) and all(n1 == n2 for n1, n2 in zip(self, other))
        elif isinstance(other, Neighbors):
            return all(n1 == other for n1 in self)
        return False
    
    def __ne__(self, other):
        return not (self == other)
    
    def to_json(self, compressed=True, num_neighbors=None):
        """
        Serializes the list of Neighbors objects to JSON.

        Args:
            compressed: Passed to `to_json` of each contained object
            num_neighbors: Passed to `to_json` of each contained object

        Returns:
            A list with one serialized entry per contained object.
        """
        return [n.to_json(compressed=compressed, num_neighbors=num_neighbors)
                for n in self]
        
    @classmethod
    def from_json(cls, data):
        """
        Builds a NeighborSet from the output of `to_json`.

        Args:
            data: A list of serialized `Neighbors` objects

        Returns:
            A NeighborSet (which supports indexing, iteration and `len` like
            the underlying list) wrapping the deserialized objects.
        """
        # Wrap the list in cls so that the result offers the NeighborSet
        # methods (identical, to_json) rather than being a bare list.
        return cls([Neighbors.from_json(d) for d in data])
    
    def identical(self):
        """Returns True if all Neighbors objects within this NeighborSet are equal to each other."""
        if len(self) == 0: return True
        return all(n == self[0] for n in self)

Static methods

def from_json(data)
Expand source code
@classmethod
def from_json(cls, data):
    """
    Builds a NeighborSet from a list of serialized `Neighbors` objects.

    The deserialized objects are wrapped in `cls` so that the result offers
    the NeighborSet methods rather than being a bare list.
    """
    return cls([Neighbors.from_json(d) for d in data])

Methods

def identical(self)

Returns True if all Neighbors objects within this NeighborSet are equal to each other.

Expand source code
def identical(self):
    """Tells whether every Neighbors object in this NeighborSet equals the first one (True when empty)."""
    if not len(self):
        return True
    first = self[0]
    for candidate in self:
        if not (candidate == first):
            return False
    return True
def to_json(self, compressed=True, num_neighbors=None)

Serializes the list of Neighbors objects to JSON.

Expand source code
def to_json(self, compressed=True, num_neighbors=None):
    """
    Produces a list holding the JSON form of each contained Neighbors
    object, forwarding both options to every element.
    """
    serialized = []
    for neighbors in self:
        entry = neighbors.to_json(compressed=compressed,
                                  num_neighbors=num_neighbors)
        serialized.append(entry)
    return serialized
class Neighbors (values, ids=None, metric='euclidean', n_neighbors=100, clf=None)

An object representing a serializable set of nearest neighbors within an embedding. The Neighbors object simply stores a matrix of integer IDs, where rows correspond to points in the embedding and columns are IDs of neighbors in order of proximity to each point. These neighbors can be accessed through the values property.

This constructor should typically not be used - use Neighbors.compute instead.

Args

values
Matrix of n x n_neighbors neighbor IDs, where row i lists the IDs of the neighbors of point i in order of proximity
ids
If supplied, a list of IDs for the points in the matrix
metric
Distance metric to use to compute neighbors (can be any supported metric for sklearn.neighbors.NearestNeighbors)
n_neighbors
Number of neighbors to compute and save
clf
The fitted NearestNeighbors object, if available (set by Neighbors.compute; None when the Neighbors object is loaded from JSON or produced by concat)
Expand source code
class Neighbors:
    """
    An object representing a serializable set of nearest neighbors within an
    embedding. The `Neighbors` object simply stores a matrix of integer IDs, where rows
    correspond to points in the embedding and columns are IDs of neighbors in
    order of proximity to each point. These neighbors can be accessed through the
    `values` property.
    """
    def __init__(self, values, ids=None, metric='euclidean', n_neighbors=100, clf=None):
        """
        This constructor should typically not be used - use [`Neighbors.compute`](#emblaze.neighbors.Neighbors.compute) instead.
        
        Args:
            values: Matrix of n x n_neighbors neighbor IDs, where row i lists
                the IDs of the neighbors of point i in order of proximity
            ids: If supplied, a list of IDs for the points (rows) in the
                matrix. Defaults to `0, 1, ..., n - 1`.
            metric: Distance metric associated with the neighbors (can be any
                supported metric for `sklearn.neighbors.NearestNeighbors`)
            n_neighbors: Number of neighbors saved per point
            clf: The fitted `NearestNeighbors` object, if available. It is set
                by `Neighbors.compute`, and is None when the object is loaded
                from JSON or produced by `concat`.
        """
        super().__init__()
        self.values = values
        # Fall back to positional IDs so that the default ids=None is usable.
        self.ids = ids if ids is not None else np.arange(len(values))
        # Maps each ID to the row of `values` that holds its neighbors.
        self._id_index = {id_val: i for i, id_val in enumerate(self.ids)}
        self.metric = metric
        self.n_neighbors = n_neighbors
        self.clf = clf
    
    @classmethod
    def compute(cls, pos, ids=None, metric='euclidean', n_neighbors=100):
        """
        Compute a nearest-neighbor set using a given metric.
        
        Args:
            pos: Matrix of n x D high-dimensional positions
            ids: If supplied, a list of IDs for the points in the matrix
            metric: Distance metric to use to compute neighbors (can be any supported
                metric for `sklearn.neighbors.NearestNeighbors`)
            n_neighbors: Number of neighbors to compute and save
            
        Returns:
            An initialized `Neighbors` object containing computed neighbors.
        """
        # Convert to an array so that a plain list of IDs supports the fancy
        # indexing used below.
        ids = np.asarray(ids) if ids is not None else np.arange(len(pos))
        # One extra neighbor is requested because the first column is dropped
        # below (it is expected to be the query point itself).
        neighbor_clf = NearestNeighbors(metric=metric,
                                        n_neighbors=n_neighbors + 1).fit(pos)
        _, neigh_indexes = neighbor_clf.kneighbors(pos)
        
        return cls(ids[neigh_indexes[:,1:]], ids=ids, metric=metric, n_neighbors=n_neighbors, clf=neighbor_clf)
        
    def index(self, id_vals):
        """
        Returns the index(es) of the given IDs.

        Args:
            id_vals: A single ID, or a list, tuple, numpy array or set of IDs

        Returns:
            A list of row indexes if a collection was given, otherwise a
            single row index.
        """
        if isinstance(id_vals, (list, tuple, np.ndarray, set)):
            return [self._id_index[int(id_val)] for id_val in id_vals]
        else:
            return self._id_index[int(id_vals)]

    def __getitem__(self, ids):
        """ids can be a single ID or a sequence of IDs; None returns the full matrix"""
        if ids is None: return self.values
        return self.values[self.index(ids)]
    
    def __eq__(self, other):
        """
        Two Neighbors objects are equal when both their IDs and their neighbor
        matrices match exactly.
        """
        # Returning NotImplemented lets Python try the reflected comparison,
        # so `neighbors == neighbor_set` is still answered by the other
        # operand's __eq__, and unrelated types compare unequal.
        if not isinstance(other, Neighbors): return NotImplemented
        # IDs are integers, so compare exactly: a tolerance-based comparison
        # treats large, distinct IDs as equal and raises on mismatched shapes.
        return np.array_equal(self.ids, other.ids) and np.array_equal(self.values, other.values)
    
    def __ne__(self, other):
        return not (self == other)
        
    def __len__(self):
        """Number of points (rows of the neighbor matrix)."""
        return len(self.values)
    
    def calculate_neighbors(self, pos, return_distance=True, n_neighbors=None):
        """
        Queries the stored neighbor classifier for the neighbors of new
        positions.

        Args:
            pos: Positions to query, passed directly to `self.clf.kneighbors`
            return_distance: If True, return a (distances, indexes) tuple;
                otherwise return only the indexes
            n_neighbors: Number of neighbors to retrieve. If None (or 0),
                `self.n_neighbors` is used.

        Returns:
            The index array returned by `self.clf.kneighbors` (not mapped
            through `ids`), preceded by the distances if `return_distance`
            is True.

        Raises:
            ValueError: If this object has no neighbor classifier.
        """
        if self.clf is None:
            raise ValueError(
                ("Cannot compute neighbors because the Neighbors was not "
                 "initialized with a neighbor classifier - was it deserialized "
                 "from JSON without saving the original coordinates or "
                 "concatenated to another Neighbors?"))
        neigh_dists, neigh_indexes = self.clf.kneighbors(pos, n_neighbors=n_neighbors or self.n_neighbors)
        if return_distance:
            return neigh_dists, neigh_indexes
        return neigh_indexes
    
    def concat(self, other):
        """Concatenates the two Neighbors together, discarding the original 
        classifier.

        Args:
            other: Another `Neighbors` object with the same metric and no IDs
                in common with this one

        Returns:
            A new `Neighbors` object whose rows are this object's rows
            followed by the rows of `other`.
        """
        # np.asarray allows ids stored as plain lists (e.g. after from_json).
        own_ids = set(np.asarray(self.ids).tolist())
        other_ids = set(np.asarray(other.ids).tolist())
        assert not (own_ids & other_ids), "Cannot concatenate Neighbors objects with overlapping ID values"
        assert self.metric == other.metric, "Cannot concatenate Neighbors objects with different metrics"
        # np.concatenate takes the arrays as a single sequence argument; its
        # second positional argument is the axis.
        return Neighbors(
            np.concatenate([self.values, other.values]),
            ids=np.concatenate([self.ids, other.ids]),
            metric=self.metric,
            n_neighbors=max(self.n_neighbors, other.n_neighbors)
        )
    
    def to_json(self, compressed=True, num_neighbors=None):
        """
        Serializes the neighbors to a JSON object.

        Args:
            compressed: If True, encode the IDs and neighbors with
                `encode_numerical_array`; otherwise write a dictionary mapping
                each ID to its list of neighbor IDs
            num_neighbors: If not None, the maximum number of neighbors to
                write per point

        Returns:
            A dictionary describing this object.
        """
        result = {}
        result["metric"] = self.metric
        result["n_neighbors"] = self.n_neighbors
        
        neighbors = self.values
        if num_neighbors is not None:
            neighbors = neighbors[:,:min(num_neighbors, neighbors.shape[1])]
            
        if compressed:
            result["_format"] = "compressed"
            # Specify the type name that will be used to encode the point IDs.
            # This is important because the neighbor array takes up the bulk
            # of the space when transferring to file/widget.
            dtype, type_name = choose_integer_type(self.ids)
            result["_idtype"] = type_name
            result["_length"] = len(self)
            result["ids"] = encode_numerical_array(self.ids, dtype)
            
            result["neighbors"] = encode_numerical_array(neighbors.flatten(),
                                                            astype=dtype,
                                                            interval=neighbors.shape[1])
        else:
            result["_format"] = "expanded"
            result["neighbors"] = {}
            for id_val, row in zip(self.ids, neighbors):
                # numpy scalars are rejected as dictionary keys by the json
                # module, so convert them to native Python values.
                key = id_val.item() if isinstance(id_val, np.generic) else id_val
                result["neighbors"][key] = row.tolist()
        return result
    
    @classmethod
    def from_json(cls, data):
        """
        Builds a `Neighbors` object from the output of `to_json`. The result
        has no neighbor classifier.

        Args:
            data: A dictionary in either the "compressed" or the "expanded"
                format (the latter is assumed if "_format" is missing)
        """
        if data.get("_format", "expanded") == "compressed":
            dtype = np.dtype(data["_idtype"])
            ids = decode_numerical_array(data["ids"], dtype)
            # NOTE(review): assumes decode_numerical_array restores the 2D
            # neighbor matrix from the stored interval - confirm in utils.
            neighbors = decode_numerical_array(data["neighbors"], dtype)
        else:
            neighbor_dict = data["neighbors"]
            try:
                # JSON object keys are strings, so restore integer IDs.
                ids = [int(id_val) for id_val in neighbor_dict]
                neighbor_dict = {int(k): v for k, v in neighbor_dict.items()}
            except (ValueError, TypeError):
                # Keys that are not integers are kept as they are.
                ids = list(neighbor_dict.keys())
            ids = sorted(ids)
            neighbors = np.array([neighbor_dict[id_val] for id_val in ids])
                
        return cls(neighbors, ids=ids, metric=data["metric"], n_neighbors=data["n_neighbors"])

Static methods

def compute(pos, ids=None, metric='euclidean', n_neighbors=100)

Compute a nearest-neighbor set using a given metric.

Args

pos
Matrix of n x D high-dimensional positions
ids
If supplied, a list of IDs for the points in the matrix
metric
Distance metric to use to compute neighbors (can be any supported metric for sklearn.neighbors.NearestNeighbors)
n_neighbors
Number of neighbors to compute and save

Returns

An initialized Neighbors object containing computed neighbors.

Expand source code
@classmethod
def compute(cls, pos, ids=None, metric='euclidean', n_neighbors=100):
    """
    Compute a nearest-neighbor set using a given metric.
    
    Args:
        pos: Matrix of n x D high-dimensional positions
        ids: If supplied, a list of IDs for the points in the matrix
        metric: Distance metric to use to compute neighbors (can be any supported
            metric for `sklearn.neighbors.NearestNeighbors`)
        n_neighbors: Number of neighbors to compute and save
        
    Returns:
        An initialized `Neighbors` object containing computed neighbors.
    """
    # Convert to an array so that a plain list of IDs supports the fancy
    # indexing used below.
    ids = np.asarray(ids) if ids is not None else np.arange(len(pos))
    # One extra neighbor is requested because the first column is dropped
    # below (it is expected to be the query point itself).
    neighbor_clf = NearestNeighbors(metric=metric,
                                    n_neighbors=n_neighbors + 1).fit(pos)
    _, neigh_indexes = neighbor_clf.kneighbors(pos)
    
    return cls(ids[neigh_indexes[:,1:]], ids=ids, metric=metric, n_neighbors=n_neighbors, clf=neighbor_clf)
def from_json(data)
Expand source code
@classmethod
def from_json(cls, data):
    """
    Builds a `Neighbors` object from the output of `to_json`. The result has
    no neighbor classifier.

    Args:
        data: A dictionary in either the "compressed" or the "expanded"
            format (the latter is assumed if "_format" is missing)
    """
    if data.get("_format", "expanded") == "compressed":
        dtype = np.dtype(data["_idtype"])
        ids = decode_numerical_array(data["ids"], dtype)
        # NOTE(review): assumes decode_numerical_array restores the 2D
        # neighbor matrix from the stored interval - confirm in utils.
        neighbors = decode_numerical_array(data["neighbors"], dtype)
    else:
        neighbor_dict = data["neighbors"]
        try:
            # JSON object keys are strings, so restore integer IDs.
            ids = [int(id_val) for id_val in neighbor_dict]
            neighbor_dict = {int(k): v for k, v in neighbor_dict.items()}
        except (ValueError, TypeError):
            # Keys that are not integers are kept as they are.
            ids = list(neighbor_dict.keys())
        ids = sorted(ids)
        neighbors = np.array([neighbor_dict[id_val] for id_val in ids])
            
    return cls(neighbors, ids=ids, metric=data["metric"], n_neighbors=data["n_neighbors"])

Methods

def calculate_neighbors(self, pos, return_distance=True, n_neighbors=None)
Expand source code
def calculate_neighbors(self, pos, return_distance=True, n_neighbors=None):
    """
    Asks the stored neighbor classifier for the neighbors of `pos`.

    Uses `self.n_neighbors` when `n_neighbors` is None (or 0). Returns a
    (distances, indexes) tuple when `return_distance` is True, otherwise only
    the indexes. Raises ValueError if there is no classifier.
    """
    classifier = self.clf
    if classifier is None:
        raise ValueError(
            ("Cannot compute neighbors because the Neighbors was not "
             "initialized with a neighbor classifier - was it deserialized "
             "from JSON without saving the original coordinates or "
             "concatenated to another Neighbors?"))
    count = n_neighbors if n_neighbors else self.n_neighbors
    distances, indexes = classifier.kneighbors(pos, n_neighbors=count)
    return (distances, indexes) if return_distance else indexes
def concat(self, other)

Concatenates the two Neighbors together, discarding the original classifier.

Expand source code
def concat(self, other):
    """Concatenates the two Neighbors together, discarding the original 
    classifier.

    Args:
        other: Another `Neighbors` object with the same metric and no IDs in
            common with this one

    Returns:
        A new `Neighbors` object whose rows are this object's rows followed
        by the rows of `other`.
    """
    # np.asarray allows ids stored as plain lists (e.g. after from_json).
    own_ids = set(np.asarray(self.ids).tolist())
    other_ids = set(np.asarray(other.ids).tolist())
    assert not (own_ids & other_ids), "Cannot concatenate Neighbors objects with overlapping ID values"
    assert self.metric == other.metric, "Cannot concatenate Neighbors objects with different metrics"
    # np.concatenate takes the arrays as a single sequence argument; its
    # second positional argument is the axis.
    return Neighbors(
        np.concatenate([self.values, other.values]),
        ids=np.concatenate([self.ids, other.ids]),
        metric=self.metric,
        n_neighbors=max(self.n_neighbors, other.n_neighbors)
    )
def index(self, id_vals)

Returns the index(es) of the given IDs.

Expand source code
def index(self, id_vals):
    """
    Returns the index(es) of the given IDs.

    Args:
        id_vals: A single ID, or a list, tuple, numpy array or set of IDs

    Returns:
        A list of row indexes if a collection was given, otherwise a single
        row index.
    """
    # Tuples are accepted alongside the other collection types; previously a
    # tuple was treated as a single ID and failed in int().
    if isinstance(id_vals, (list, tuple, np.ndarray, set)):
        return [self._id_index[int(id_val)] for id_val in id_vals]
    else:
        return self._id_index[int(id_vals)]
def to_json(self, compressed=True, num_neighbors=None)

Serializes the neighbors to a JSON object.

Expand source code
def to_json(self, compressed=True, num_neighbors=None):
    """
    Serializes the neighbors to a JSON object.

    Args:
        compressed: If True, encode the IDs and neighbors with
            `encode_numerical_array`; otherwise write a dictionary mapping
            each ID to its list of neighbor IDs
        num_neighbors: If not None, the maximum number of neighbors to write
            per point

    Returns:
        A dictionary describing this object.
    """
    result = {}
    result["metric"] = self.metric
    result["n_neighbors"] = self.n_neighbors
    
    neighbors = self.values
    if num_neighbors is not None:
        neighbors = neighbors[:,:min(num_neighbors, neighbors.shape[1])]
        
    if compressed:
        result["_format"] = "compressed"
        # Specify the type name that will be used to encode the point IDs.
        # This is important because the neighbor array takes up the bulk
        # of the space when transferring to file/widget.
        dtype, type_name = choose_integer_type(self.ids)
        result["_idtype"] = type_name
        result["_length"] = len(self)
        result["ids"] = encode_numerical_array(self.ids, dtype)
        
        result["neighbors"] = encode_numerical_array(neighbors.flatten(),
                                                        astype=dtype,
                                                        interval=neighbors.shape[1])
    else:
        result["_format"] = "expanded"
        result["neighbors"] = {}
        for id_val, row in zip(self.ids, neighbors):
            # numpy scalars are rejected as dictionary keys by the json
            # module, so convert them to native Python values.
            key = id_val.item() if isinstance(id_val, np.generic) else id_val
            result["neighbors"][key] = row.tolist()
    return result