Module emblaze.recommender

Defines a class to compute Suggested Selections, or clusters that exhibit consistent or noteworthy changes from one frame to another.
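
The core quantity is the Jaccard distance between per-point neighbor sets, applied to the neighbors each point gains and loses between two frames. A minimal pure-Python illustration of what the vectorized code below computes for one pair of points:

    # Jaccard distance: 1 - (intersection size / union size)
    a, b = {1, 2, 3}, {2, 3, 4}
    distance = 1 - len(a & b) / len(a | b)  # 1 - 2/4 = 0.5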

Expand source code
"""
Defines a class to compute Suggested Selections, or clusters that exhibit
consistent or noteworthy changes from one frame to another.
"""

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from .utils import Field, inverse_intersection, standardize_json
from numba.typed import List
from scipy.sparse import csr_matrix
import collections

NUM_NEIGHBORS_FOR_SEARCH = 10

class SelectionRecommender:
    """
    Generates recommended selections based on a variety of inputs. The
    recommender works by pre-generating a list of clusters at various
    granularities, then sorting them by relevance to a given query.
    """
    def __init__(self, embeddings, clusters=None, progress_fn=None, frame_idx=None, preview_frame_idx=None, filter_points=None):
        super().__init__()
        self.embeddings = embeddings
        self.clusters = clusters if clusters is not None else {}
        self.is_restricted = frame_idx is not None or preview_frame_idx is not None or filter_points is not None
        embs_first = [frame_idx] if frame_idx is not None else range(len(self.embeddings))
        embs_second = [preview_frame_idx] if preview_frame_idx is not None else range(len(self.embeddings))
        # Number of ordered frame pairs to cluster, used for progress reporting
        total_num_embs = sum(1 for x in embs_first for y in embs_second if x != y)
        for i in embs_first:
            for j in embs_second:
                if i == j or (i, j) in self.clusters: continue
                self.clusters[(i, j)] = self._make_clusters(i, j, np.log10(len(self.embeddings[i])), filter_points=filter_points)
                if progress_fn is not None:
                    progress_fn(len(self.clusters) / total_num_embs)
        
    def _make_neighbor_mat(self, neighbors, num_columns):
        """Converts a list of neighbor indexes into a one-hot encoded matrix."""
        # Column 0 absorbs -1 padding entries and is dropped before returning
        neighbor_mat = np.zeros((len(neighbors), num_columns + 1), dtype=np.uint8)
        if isinstance(neighbors, list):
            # Ragged input: pad each row with -1 to form a rectangular array
            max_len = max(len(n) for n in neighbors)
            neighbors_padded = -np.ones((len(neighbors), max_len), dtype=int)
            for i, n in enumerate(neighbors):
                neighbors_padded[i,:len(n)] = list(n)
            neighbors = neighbors_padded

        for i in range(neighbors.shape[1]):
            neighbor_mat[np.arange(len(neighbors)), neighbors[:,i] + 1] = 1
        return csr_matrix(neighbor_mat[:,1:])

    def _pairwise_jaccard_distances(self, neighbors):
        """Computes the jaccard distance between each row of the given set of neighbors."""
        lengths = np.array([len(n) for n in neighbors], dtype=np.uint16)
        if np.sum(lengths) == 0:
            return np.zeros((len(neighbors), len(neighbors)))

        # Make a one-hot matrix of neighbors
        neighbor_mat = self._make_neighbor_mat(neighbors, max(np.max([n for x in neighbors for n in x]) + 1, len(neighbors)))
        # Calculate intersection of sets using dot product
        intersection = np.dot(neighbor_mat, neighbor_mat.T)
        del neighbor_mat
        
        # Use set trick: len(x | y) = len(x) + len(y) - len(x & y)
        length_sums = lengths[:,np.newaxis] + lengths[np.newaxis,:]
        # Clamp unions to at least 1 to avoid dividing by zero below
        union = np.maximum(length_sums - intersection, np.array([1], dtype=np.uint16), casting='no')
        del length_sums
        result = np.zeros((len(neighbors), len(neighbors)), dtype=np.float16)
        np.true_divide(intersection.todense(), union, out=result)
        return np.array([1.0], dtype=np.float16) - result

    def _make_neighbor_changes(self, idx_1, idx_2, filter_points=None):
        """
        Computes the sets of gained IDs and lost IDs for the given pair of frames.
        """
        frame_1 = self.embeddings[idx_1]
        frame_2 = self.embeddings[idx_2]
        frame_1_neighbors = frame_1.get_recent_neighbors()[filter_points or None]
        frame_2_neighbors = frame_2.get_recent_neighbors()[filter_points or None]
        gained_ids = [set(frame_2_neighbors[i]) - set(frame_1_neighbors[i]) for i in range(len(filter_points or frame_1))]
        lost_ids = [set(frame_1_neighbors[i]) - set(frame_2_neighbors[i]) for i in range(len(filter_points or frame_1))]

        return gained_ids, lost_ids
        
    def _consistency_score(self, ids, frame):
        """
        Computes the consistency between the neighbors for the given set of IDs
        in the given frame, as the mean pairwise Jaccard similarity between
        their neighbor sets (self-comparisons excluded; assumes len(ids) >= 2).
        """
        return (np.sum(1 - self._pairwise_jaccard_distances(frame.get_recent_neighbors()[ids])) - len(ids)) / (len(ids) * (len(ids) - 1))
        
    def _inner_change_score(self, ids, frame_1, frame_2):
        """
        Computes the inverse intersection of the neighbor sets in the given
        two frames.
        """
        return np.mean(inverse_intersection(frame_1.get_recent_neighbors()[ids],
                                            frame_2.get_recent_neighbors()[ids],
                                            List(ids),
                                            False))
        
    def _change_score(self, change_set, ids, num_neighbors=10):
        """
        Computes a score estimating the consistency of the changes for the given
        set of IDs, as the mean relative frequency of the most common gained or
        lost neighbors outside the cluster.
        """
        counter = collections.Counter([x for s in change_set for x in s if x not in ids])
        return np.mean([x[1] / len(ids) for x in sorted(counter.items(), key=lambda x: x[1], reverse=True)[:num_neighbors]])
        
    def _make_clusters(self, idx_1, idx_2, min_cluster_size=1, filter_points=None):
        """
        Produces clusters of points whose neighbor sets change similarly between the given pair of frames.
        """
        filter_points = list(filter_points) if filter_points is not None else None
        all_ids = np.array(filter_points) if filter_points is not None else self.embeddings[idx_1].ids
        
        gained_ids, lost_ids = self._make_neighbor_changes(idx_1, idx_2, filter_points=filter_points)
        distances = (self._pairwise_jaccard_distances(gained_ids) + self._pairwise_jaccard_distances(lost_ids)) / 2
        clusters = []
        
        for threshold in np.arange(0.7, 0.91, 0.1):
            # Note: scikit-learn >= 1.2 renames the 'affinity' keyword to 'metric'
            clusterer = AgglomerativeClustering(n_clusters=None,
                                                distance_threshold=threshold,
                                                affinity='precomputed',
                                                linkage='average')
            clusterer.fit(distances)
            cluster_labels = clusterer.labels_
            
            for label, count in zip(*np.unique(cluster_labels, return_counts=True)):
                if count < min_cluster_size: continue
                indexes = np.arange(len(cluster_labels))[cluster_labels == label]
                ids = all_ids[cluster_labels == label].tolist()

                clusters.append({
                    'ids': set(ids),
                    'frame': idx_1,
                    'previewFrame': idx_2,
                    'consistency': self._consistency_score(ids, self.embeddings[idx_1]), 
                    'innerChange': self._inner_change_score(ids, self.embeddings[idx_1], self.embeddings[idx_2]),
                    'gain': self._change_score([gained_ids[i] for i in indexes], ids), 
                    'loss': self._change_score([lost_ids[i] for i in indexes], ids)
                })
        return clusters
    
    def query(self, ids_of_interest=None, filter_ids=None, frame_idx=None, preview_frame_idx=None, bounding_box=None, num_results=10, id_type="selection"):
        """
        Returns a list of clusters matching the given filters, sorted by
        relevance.
        
        Args:
            ids_of_interest: A list of ID values. If there are sufficiently many clusters
                containing at least one ID in this list, only those clusters will be returned.
                Otherwise, clusters containing IDs from the neighbor sets of those IDs
                may be returned as well.
            filter_ids: A list of IDs such that at least one point in every cluster
                MUST be present in this list.
            frame_idx: A base frame index to filter for. If None, clusters from any frame
                may be returned.
            preview_frame_idx: A preview frame index to filter for. frame_idx must be
                provided if this is provided.
            bounding_box: If provided, should be a tuple of four values: min x, max x, 
                min y, and max y. At least one point in each cluster will be required to
                be within the bounding box.
            num_results: Maximum number of results to return.
            id_type: The type of ID that ids_of_interest corresponds to. This goes into
                the explanation string for clusters, e.g. "shares 3 points with <id_type>".
                
        Returns:
            A list of suggested selections. Each suggestion is returned as a
            tuple of two values - a cluster dictionary (whose 'ids' key holds
            the set of point IDs), and a string "reason" explaining why the
            cluster is recommended. No two returned clusters share IDs.
        """
        
        # Determine which frames to look for clusters in
        frames_to_check = []
        if frame_idx is not None:
            if preview_frame_idx is not None:
                frames_to_check.append((frame_idx, preview_frame_idx))
            else:
                frames_to_check = [(frame_idx, j) for j in range(len(self.embeddings)) if frame_idx != j]
        else:
            frames_to_check = [(i, j) for i in range(len(self.embeddings)) for j in range(len(self.embeddings)) if i != j]
            
        interest_set = set(ids_of_interest) if ids_of_interest is not None else None
        filter_set = set(filter_ids) if filter_ids is not None else None
        
        candidates = []
        for frame_key in frames_to_check:
            base_frame = self.embeddings[frame_key[0]]
            if bounding_box is not None:
                positions = base_frame.field(Field.POSITION)
            else:
                positions = None
                
            # Assemble a list of candidates
            if ids_of_interest is not None:
                neighbor_ids = set([n for n in self.embeddings[frame_key[0]].get_recent_neighbors()[ids_of_interest][:,:NUM_NEIGHBORS_FOR_SEARCH].flatten()])
            else:
                neighbor_ids = None    

            for cluster in self.clusters[frame_key]:
                frame_labels = "{} &rarr; {}".format(self.embeddings[cluster['frame']].label or "Frame " + str(cluster['frame']),
                                                     self.embeddings[cluster['previewFrame']].label or "Frame " + str(cluster['previewFrame']))
                base_score = (cluster['consistency'] + cluster['innerChange'] + cluster['gain'] + cluster['loss']) * np.log(len(cluster['ids']))
                if filter_set is not None:
                    if not cluster['ids'] & filter_set:
                        continue
                    base_score *= len(cluster['ids'] & filter_set) / len(cluster['ids'])
                
                if interest_set is not None and cluster['ids'] & interest_set:
                    candidates.append((cluster, 
                                       base_score * len(cluster['ids'] & interest_set) / len(cluster['ids']),
                                       "shares {} points with {} ({})".format(len(cluster['ids'] & interest_set), id_type, frame_labels)))
                elif neighbor_ids is not None and cluster['ids'] & neighbor_ids:
                    candidates.append((cluster,
                                       base_score * 0.5 * len(cluster['ids'] & neighbor_ids) / len(cluster['ids']),
                                      "shares {} points with neighbors of {} ({})".format(len(cluster['ids'] & neighbor_ids), id_type, frame_labels)))
                elif bounding_box is not None:
                    point_positions = positions[base_frame.index(np.array(list(cluster['ids'])))]
                    num_within = np.sum((point_positions[:,0] >= bounding_box[0]) *
                              (point_positions[:,0] <= bounding_box[1]) *
                              (point_positions[:,1] >= bounding_box[2]) *
                              (point_positions[:,1] <= bounding_box[3]))
                    if num_within > 0:
                        candidates.append((cluster, base_score * np.log(num_within), frame_labels))
                elif ids_of_interest is None:
                    if frame_idx is not None and preview_frame_idx is not None:
                        reason = "matches frames "
                    elif preview_frame_idx is not None:
                        reason = "matches preview frame "
                    else:
                        reason = ""
                    candidates.append((cluster,
                                       base_score,
                                       "{}{}".format(reason, ("(" + frame_labels + ")") if reason else frame_labels)))
        
        # Sort candidates and make sure they don't include overlapping IDs
        seen_ids = set()
        results = []
        for cluster, _, reason in sorted(candidates, key=lambda x: x[1], reverse=True):
            if cluster['ids'] & seen_ids: continue
            results.append((cluster, reason))
            seen_ids |= cluster['ids']
            if len(results) >= num_results: break
                
        return results
    
    def to_json(self):
        """
        Converts the clusters stored in this Recommender object to JSON.
        """
        def _convert_cluster(cluster):
            return {k: list(v) if isinstance(v, set) else v for k, v in cluster.items()}
        return standardize_json({
            ",".join((str(i), str(j))): [_convert_cluster(c) for c in clusters]
            for (i, j), clusters in self.clusters.items()
        })
        
    @classmethod
    def from_json(cls, data, embeddings):
        """
        Reads the clusters stored in the given JSON object into a Recommender.
        """
        def _convert_cluster(cluster):
            return {k: set(v) if k == 'ids' else v for k, v in cluster.items()}
        def _convert_key(key):
            i, j = key.split(",")
            return (int(i), int(j))
        return cls(embeddings, {_convert_key(k): [_convert_cluster(c) for c in clusters]
                                for k, clusters in data.items()})
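
The set-intersection-via-dot-product trick used in _pairwise_jaccard_distances can be sanity-checked in isolation; this is an illustrative sketch, not part of the module:

    import numpy as np
    from scipy.sparse import csr_matrix

    # Two points with neighbor sets {0, 1, 2} and {1, 2, 3}, one-hot encoded
    one_hot = csr_matrix(np.array([[1, 1, 1, 0],
                                   [0, 1, 1, 1]], dtype=np.uint8))
    intersection = one_hot.dot(one_hot.T).toarray()
    # intersection[i, j] is the overlap size between rows i and j
    print(intersection)  # [[3 2]
                         #  [2 3]]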

Classes

class SelectionRecommender (embeddings, clusters=None, progress_fn=None, frame_idx=None, preview_frame_idx=None, filter_points=None)

Generates recommended selections based on a variety of inputs. The recommender works by pre-generating a list of clusters at various granularities, then sorting them by relevance to a given query.
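
A minimal construction sketch (hypothetical setup; assumes embeddings behaves like an emblaze EmbeddingSet whose frames already have neighbor sets computed):

    from emblaze.recommender import SelectionRecommender

    # Pre-generates clusters for every ordered pair of frames; progress_fn
    # receives the fraction of frame pairs processed so far.
    recommender = SelectionRecommender(
        embeddings,
        progress_fn=lambda fraction: print("{:.0%} of frame pairs clustered".format(fraction)))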

Static methods

def from_json(data, embeddings)

Reads the clusters stored in the given JSON object into a Recommender.

Methods

def query(self, ids_of_interest=None, filter_ids=None, frame_idx=None, preview_frame_idx=None, bounding_box=None, num_results=10, id_type='selection')

Returns a list of clusters matching the given filters, sorted by relevance.

Args

ids_of_interest
A list of ID values. If there are sufficiently many clusters containing at least one ID in this list, only those clusters will be returned. Otherwise, clusters containing IDs from the neighbor sets of those IDs may be returned as well.
filter_ids
A list of IDs such that at least one point in every cluster MUST be present in this list.
frame_idx
A base frame index to filter for. If None, clusters from any frame may be returned.
preview_frame_idx
A preview frame index to filter for. frame_idx must be provided if this is provided.
bounding_box
If provided, should be a tuple of four values: min x, max x, min y, and max y. At least one point in each cluster will be required to be within the bounding box.
num_results
Maximum number of results to return.
id_type
The type of ID that ids_of_interest corresponds to. This goes into the explanation string for clusters, e.g. "shares 3 points with <id_type>".

Returns

A list of suggested selections. Each suggestion is returned as a tuple of two values - a cluster dictionary (whose 'ids' key holds the set of point IDs), and a string "reason" explaining why the cluster is recommended. No two returned clusters share IDs.
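
For example, a query scoped to a hypothetical current selection (the IDs and frame index are illustrative):

    suggestions = recommender.query(ids_of_interest=[12, 45, 101],
                                    frame_idx=0,
                                    num_results=5)
    for cluster, reason in suggestions:
        print(sorted(cluster['ids']), "-", reason)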

def to_json(self)

Converts the clusters stored in this Recommender object to JSON.
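
A round-trip sketch using to_json with from_json above (the file name is hypothetical); since the constructor skips frame pairs whose clusters are already present, restoring this way avoids recomputation:

    import json

    with open("recommender.json", "w") as f:
        json.dump(recommender.to_json(), f)

    with open("recommender.json", "r") as f:
        restored = SelectionRecommender.from_json(json.load(f), embeddings)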
