Module PAPER.gibbsSampling

Created on Sat Apr 24 13:43:28 2021

@author: minx

Expand source code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 13:43:28 2021

@author: minx
"""

from PAPER.tree_tools import *
import time
from random import choices
from igraph import *
import numpy as np
import collections
import scipy.optimize

from PAPER.estimateAlpha import *
import PAPER.grafting as grafting



def gibbsToConv(graf, DP=False, K=1, 
                alpha=0, beta=0, alpha0=50,
                Burn=10, M=40, gap=1, 
                MAXITER=100, tol=0.1, 
                size_thresh=0.01, birth_thresh=0.8,
                method="full",
                burn_thresh = 0.95):
    """
    Run gibbs sampler to generate posterior root probs.
    

    Parameters
    ----------
    graf : igraph object
        Input graph.
    DP : boolean, optional
        Use random K model or not. The default is False.
    K : int, optional
        Num of cluster-trees. Ignored if DP is True. The default is 1.
    alpha : float, optional
        Parameter. Set both alpha=0 and beta=0 (default) to 
        estimate the parameters via EM. The default is 0. 
    beta : float, optional
        Parameter. Set both alpha=0 and beta=0 (default) to estimate
        the parameter via EM. The default is 0.
    alpha0 : float, optional
        Initialization for parameter. Ignored if DP is False. The default is 50.
    Burn : int, optional
        Num of burn iteration. Unimportant if chain runs to 
        convergence. The default is 10.
    M : int, optional
        Num of iterations per convergence check. The default is 40.
    gap : int, optional
        Num of samples to skip for recording results. The default is 1.
    MAXITER : int, optional
        Maximum number of convergence checks. The default is 100.
    tol : float, optional
        Convergence threshold. The default is 0.1.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. 
        Ignored if K==1. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. 
        Ignored if K==1.
        The default is 0.8.
    method : string, optional
        Either "full" or "collapsed". The default is "full".
    burn_thresh : float, optional
        Criterion for determining whether burn in is
        complete. The default is 0.95.

    Returns
    -------
    0. nparray of posterior root probs
    1. first chain outputs
    2. second chain outputs

    """
    
    n = len(graf.vs)
    m = len(graf.es)
    graf2 = graf.copy()
    
    if (alpha == 0 and beta == 0):
        beta = 1
        alpha = estimateAlphaEM(graf, display=False)
        print("Estimated alpha as {0}".format(alpha))
    else:
        print("Using alpha {0} and beta {1}".format(alpha, beta))
    
    if (DP):
        print("Using random K model")
    else:
        print("Using fixed K={0} model".format(K))
        
        
    
    options = {"Burn": Burn, "M": M, "gap": gap, "alpha": alpha, 
               "beta": beta, "display": False, "size_thresh": size_thresh,
               "birth_thresh": birth_thresh}    
    
    
    if (DP and method == "full"):
        gibbsFn = gibbsFullDP
    
    if ((not DP) and method == "full"):
        gibbsFn = gibbsFull
        
    if (DP and method == "collapsed"):
        gibbsFn = grafting.gibbsGraftDP
    if ((not DP) and method== "collapsed"):
        gibbsFn = grafting.gibbsGraft
    
    
    if (not DP):
        res = gibbsFn(graf, K=K, **options)
        res1 = gibbsFn(graf2, K=K, **options)
    else:
        res = gibbsFn(graf, alpha0=alpha0, **options)
        res1 = gibbsFn(graf2, alpha0=alpha0, **options)        


    allfreq = np.array([0] * n)
    allfreq1 = np.array([0] * n)
    
    for i in range(MAXITER):
        
        allfreq = allfreq + np.array(res[0])
        allfreq1 = allfreq1 + np.array(res1[0])
            
        p1 = allfreq/sum(allfreq)
        p2 = allfreq1/sum(allfreq1)
        
        deviation = (1/2)*sum(np.abs( p1**(1/2) - p2**(1/2) )**2)
        print((i, deviation))
        
        if (deviation < tol):
            break
        
        if (deviation > burn_thresh):
            allfreq = np.array([0] * n)
            allfreq1 = np.array([0] * n)
        
        Mp = M*(i+1)
        
        options["Burn"] = 0
        options["M"] = Mp
        
        if ( (not DP) and method=="full"):
            res = gibbsFn(graf, K=K, initpi=res[-1], **options)
            res1 = gibbsFn(graf2, K=K, initpi=res1[-1], **options)
            
        if ( (not DP) and method=="collapsed"):
            res = gibbsFn(graf, K=K, initroots=res[-1], **options)
            res1 = gibbsFn(graf2, K=K, initroots=res1[-1], **options)
            
        if (DP and method=="full"):
            res = gibbsFn(graf, initpi=res[-1], alpha0=res[-2], initroots=res[-3], **options)
            res1 = gibbsFn(graf2, initpi=res1[-1], alpha0=res1[-2], initroots=res1[-3], **options)    
            
        if (DP and method=="collapsed"):
            res = gibbsFn(graf, alpha0=res[-2], initroots=res[-1], **options)
            res1 = gibbsFn(graf2, alpha0=res1[-2], initroots=res1[-1], **options)
    
    allfreq = allfreq + allfreq1
    allfreq = allfreq/sum(allfreq)
    
    return((allfreq, res, res1))



def gibbsFull(graf, Burn=40, M=50, gap=1, alpha=0, beta=1, K=1, 
              display=True, size_thresh=0.01, birth_thresh=0.8,
              initpi=None):
    """
    Full Gibbs sampler for computing posterior root prob and 
    node tree co-occurrence in fixed K setting. 

    Parameters
    ----------
    graf : igraph object
        Input graph.
    Burn : int, optional
        Num of burn in iterations. The default is 30.
    M : int, optional
        Num of regular iterations. The default is 50.
    gap : int, optional
        Num of samples to skip when recording results. 
        The default is 1.
    alpha : float, optional
        Parameter. The default is 0.
    beta : float, optional
        Parameter. The default is 1.
    K : int, optional
        Num of roots/clusters. The default is 1.
    display : boolean, optional
        Detailed display. The default is True.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. The default is 0.8.
    initpi : list, optional
        Initialization for ordering. The default is None.

    Returns
    -------
    0: nparray of posterior root probs
    1: dictionary mapping tree k to its posterior root probs
    2: nparray of node tree co-occurrence
    3: final roots (used for initiailization)
    4: final ordering (used for initialization)

    """
    n = len(graf.vs)
    m = len(graf.es)

    if (initpi is None):
        wilsonTree(graf)
        v = choices(range(n))[0]
    
        countSubtreeSizes(graf, v)
        tree2root = [v]
        initpi = sampleOrdering(graf, tree2root, alpha, beta)
    else:
        tree2root = initpi[0:K]
    
    mypi = initpi
    
    
    node_tree_coo = np.zeros((n, 0))
    
    freq = {}
    if (K == 1):
        freq[0] = [0] * n

    
    for i in range(Burn + M):
        
        for v in tree2root:
            assert graf.vs[v]["pa"] is None        
        
        nodewiseSamplePA(graf, mypi, alpha=alpha, beta=beta, K=K)
        tree2root = mypi[0:K]
        mypi = sampleOrdering(graf, tree2root, alpha=alpha, beta=beta)
        
        
        ## sort and display sizes
        sizes = getTreeSizes(graf, tree2root)
        sizes_sorted = -np.sort( - np.array(sizes))
        sizes_args = np.argsort(- np.array(sizes) )
        
        if (display):
            print("iter {0}  sizes {1}".format(i, sizes_sorted))
        
        tree2root_sorted = [0] * len(tree2root)
        for k in range(len(tree2root)):
            tree2root_sorted[k] = tree2root[sizes_args[k]]
        
        """ record results """
        
        if (i >= Burn and i % gap == 0):
            if (K == 1):
                freq[0] = freq[0] + countAllHist(graf, tree2root[0])[0]
            else:   
                node_tree_coo = updateInferResults(graf, freq, tree2root, 
                                                   alpha=alpha, beta=beta, 
                                                   size_thresh=size_thresh, 
                                                   birth_thresh=birth_thresh, 
                                                   node_tree_coo=node_tree_coo)
                
    allfreqs = np.array([0] * n)    
    
    for k in range(len(freq)):
        allfreqs = allfreqs + freq[k]
        freq[k] = freq[k]/sum(freq[k])
        
    allfreqs = allfreqs/sum(allfreqs)
    
    return((allfreqs, freq, node_tree_coo, tree2root, mypi))
        


def gibbsFullDP(graf, Burn=20, M=50, gap=1, alpha=0, beta=1, alpha0=50, 
                display=True, size_thresh=0.01, 
                birth_thresh=0.8, initpi=None, initroots=None):
    """
    Full Gibbs sampler for computing posterior root prob 
    in the random K setting.  

    Parameters
    ----------
    graf : igraph object
        Input graph.
    Burn : int, optional
        Num of burn in iterations. The default is 30.
    M : int, optional
        Num of regular iterations. The default is 50.
    gap : int, optional
        Num of samples to skip when recording results. 
        The default is 1.
    alpha : float, optional
        Parameter. The default is 0.
    beta : float, optional
        Parameter. The default is 1.
    alpha0 : float, optional
        Parameter. The default is 5.
    display : boolean, optional
        Detailed display. The default is True.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. The default is 0.8.
    initpi : list, optional
        Ordering initialization. The default is None.
    initroots : list, optional
        Root initialization. The default is None.

    Returns
    -------
    0. nparray of length n of posterior root prob
    1. dict giving posterior root prob for each distinct cluster-tree
    2. list of all Ks
    3. final set of roots (used for initialization)
    4. final alpha0 (used for initialization)
    5. final ordering (used for initialization)

    """
    
    
    n = len(graf.vs)
    m = len(graf.es)
    
    if (initpi is None):
        
        wilsonTree(graf)
        v = choices(range(n))[0]
    
        countSubtreeSizes(graf, v)
        tree2root = [v]
    
        tmp = sampleOrdering(graf, tree2root, alpha, beta, DP=True)
        initpi = tmp[0]
        tree2root = tmp[1]
    else:
        tree2root = initroots
        
    mypi = initpi
    
    allK = []
    
    freq = {}
    bigK = 0    
    
    for i in range(Burn + M):
            
        tree2root = nodewiseSampleDP(graf, mypi, tree2root, alpha=alpha, beta=beta, alpha0=alpha0)
        
        sizes = getTreeSizes(graf, tree2root)
                
        tmp = sampleOrdering(graf, tree2root, alpha=alpha, beta=beta, DP=True)
        mypi = tmp[0]
        tree2root = tmp[1]
    
    
        K = len(tree2root)
    
        sizes_sorted = -np.sort( - np.array(sizes))
        sizes_args = np.argsort( - np.array(sizes))
    
        ## Uncomment to update alpha0
        alpha0tilde = drawAlpha0tilde(K, n, alpha0/(alpha+2*beta))
        alpha0 = alpha0tilde*(alpha+2*beta)
                      
        
        
        if (display):
            print("iter {0}  a0 {1}  K {2}  sizes{3}".format(i, round(alpha0, 3),
                                                             K, sizes_sorted))
            
        """ record results """

        if (i >= Burn and i % gap == 0):
            allK.append(len(tree2root))
            
            updateInferResults(graf, freq, tree2root, 
                               alpha=alpha, beta=beta, 
                               size_thresh=size_thresh, 
                               birth_thresh=birth_thresh)
            
            
    allfreqs = np.array([0] * n)
    for k in range(len(freq)):
        allfreqs = allfreqs + freq[k]
        freq[k] = freq[k]/sum(freq[k])     
        
    return((allfreqs, freq, allK, tree2root, alpha0, mypi))





def nodewiseSampleDP(graf, mypi, tree2root, alpha, beta, alpha0):
    """
    Generates new forest for a given ordering by sampling
    a new parent for each node. Used in random K setting.

    Require: graf.es has "tree" attribute    

    Parameters
    ----------
    graf : igraph object
        Input graph; "tree" edge attribute and "pa" node
        attributes are modified in place.
    mypi : list
        Given ordering of the nodes.
    tree2root : list
        Lists of the roots for each of the trees.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    alpha0 : float
        Parameter.

    Returns
    -------
    New list of roots

    """
    n = len(graf.vs)
    m = len(graf.es)
    n2 = n*(n-1)/2
    
    ## DEBUG
    getTreeSizes(graf, tree2root)
    
    
    root_dict = {}
    for v in tree2root:
        root_dict[v] = 1
        
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i    
                
    all_tree_degs = getAllTreeDeg(graf)        
    assert sum(all_tree_degs) == 2*(n-len(tree2root))
    
    
    edge_ls = []
    curK = len(tree2root)
    for i in range(n-1):
       
        k = i + 1
        u = mypi[k]
        mypa = graf.vs[u]["pa"]
        uisroot = (mypa == None)
        
        nbs = graf.neighbors(u)
        nbs = [w for w in nbs if mypi_inv[w] < k]

        tree_degs = np.array([all_tree_degs[w] for w in nbs])
        root_adj = np.array([w in root_dict for w in nbs])
        pa_adj = np.array([w == mypa for w in nbs])
        
        tmp_p = beta*tree_degs + 2*beta*root_adj - beta*pa_adj + alpha
        
        new_root_wt = alpha0 * (m-n+curK+1-uisroot)/(n2-n+curK+1-uisroot) * \
            (beta*all_tree_degs[u] + beta*uisroot + alpha)/(beta+alpha)
        
        tmp_p = np.append(tmp_p, new_root_wt)
        
        """ draw a new parent for u"""
        nbs.append(-1)
        myw = choices(nbs, weights=tmp_p)[0]
        
        if (myw == -1):
            myw = None
        
        if (myw == mypa):
            if (mypa != None):
                edge_ls.append((u, mypa))
            
            continue
        
        """ modifying pa, all_tree_degs """
        if (myw != None):
            all_tree_degs[myw] = all_tree_degs[myw] + 1
            if (not uisroot):
                all_tree_degs[mypa] = all_tree_degs[mypa] - 1
            else:
                all_tree_degs[u] = all_tree_degs[u] + 1
                root_dict.pop(u)
                curK = curK - 1
            
            edge_ls.append((u, myw))
        else:
            ## u was not a root, became a root
            assert mypa != None
            root_dict[u] = 1
            curK = curK + 1
            all_tree_degs[u] = all_tree_degs[u] - 1
            all_tree_degs[mypa] = all_tree_degs[mypa] - 1


    assert len(edge_ls) == (n - curK)
    graf.es["tree"] = 0
    graf.vs["pa"] = None
    
    graf.es[graf.get_eids(edge_ls)]["tree"] = 1
    
    rootset = list(root_dict.keys())
    return(rootset)





def nodewiseSamplePA(graf, mypi, alpha, beta, K):
    """
    Generates new forest for a given ordering by sampling
    a new parent for each node. Used in fixed K setting.

    Require: graf.es has "tree" attribute

    Parameters
    ----------
    graf : igraph object
        Input graph; "tree" edge attribute and "pa" node
        attributes are modified in place.
    mypi : list
        Given ordering of the nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    K : int
        Num of clusters.

    Returns
    -------
    None.

    """
    n  = len(graf.vs)
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i

    for k in range(K):
        countSubtreeSizes(graf, mypi[k])

    all_tree_degs = [0] * n
    for i in range(n):
        mypa = graf.vs[i]["pa"]
        if (mypa != None):
            all_tree_degs[mypa] = all_tree_degs[mypa] + 1
            all_tree_degs[i] = all_tree_degs[i] + 1
        
    edge_ls = []
    
    for i in range(n-K):
        k = K+i
        v = mypi[k]
        
        mypa = graf.vs[v]["pa"]
        assert mypa is not None
        ## adjust parent degree
        all_tree_degs[mypa] = all_tree_degs[mypa] - 1
        
        nbs = graf.neighbors(v)
        nbs = [w for w in nbs if mypi_inv[w] < k]

        tree_degs = [all_tree_degs[w] for w in nbs]
        tree_degs = np.array(tree_degs)

        root_adj = np.array([w in mypi[0:K] for w in nbs])
        
        if (K == 1):
            root_adj = 0
        
        """ generate new parent for u"""
        tmp_p = beta*tree_degs + 2*beta*root_adj + alpha
        myw = choices(nbs, weights=tmp_p)[0]

        edge_ls.append((v, myw))
        ## myw may potentially be mypa
        all_tree_degs[myw] = all_tree_degs[myw] + 1
        
    assert len(edge_ls) == (n - K)
    graf.es["tree"] = 0
    graf.vs["pa"] = None
    
    graf.es[graf.get_eids(edge_ls)]["tree"] = 1
    
    


def sampleOrdering(graf, tree2root, alpha, beta, DP=False):
    """
    Condition on the forest, generate a new root for each
    tree and generate a new global ordering.
    
    Require: graf.vs has "pa" attribute; graf.es has "tree" attribute

    Parameters
    ----------
    graf : igraph object
        Input graph; "pa" and "subtree_size" vertex attributes
        modified in place.
    tree2root : list
        list of root nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    DP : boolean, optional
        Use random K model or not. The default is False.

    Returns
    -------
    0. new node ordering
    1. list of new roots (only used in random K setting)

    """
    K = len(tree2root)
    n = len(graf.vs)
    
    time3 = time.time()
    
    degs = getAllTreeDeg(graf)
    
    mypi = [0] * n
    
    tree_sizes = getTreeSizes(graf, tree2root)
    
    """ draw new roots for each subtree """
    for k in range(K):
        if (tree_sizes[k] == 1):
            graf.vs[tree2root[k]]["subtree_size"] = 1
            mypi[k] = tree2root[k]
            continue
        
        cur_root = tree2root[k]
        normalized_h = countAllHist(graf, cur_root)[0]
        
        deg_adj = (beta*degs + beta + alpha) * (beta*degs + alpha)
        if (K == 1):
            deg_adj = 1
        
        tmp_p = normalized_h*deg_adj
        mypi[k] = choices(range(n), tmp_p)[0]
        tree2root[k] = mypi[k]
        
        countSubtreeSizes(graf, root=mypi[k])
    
    if (DP):
        wts = [graf.vs[tree2root[k]]["subtree_size"] for k in range(K)]
        assert(sum(wts) == n)
        
        mypi[0] = tree2root[choices(range(K), wts)[0]]
        remain_nodes = [i for i in list(range(n)) if i != mypi[0]]
        assert mypi[0] not in remain_nodes
        
        mypi[1:n] = np.random.permutation(remain_nodes)
    else:
        remain_nodes = [i for i in list(range(n)) if i not in mypi[0:K]]
        mypi[K:n] = np.random.permutation(remain_nodes)
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i
    
    
    
    marked = {}
    
    if (DP):
        marked[mypi[0]] = 1
    else:
        for k in range(K):
            marked[mypi[k]] = 1
    
    for i in range(n-1):
        
        if (DP):
            k = 1 + i
        else:   
            k = K + i
            if (k >= n):
                break
            
        v = mypi[k]
        
        if (not DP): 
            assert v not in tree2root
            assert graf.vs[v]["pa"] != None
        
        if (v not in marked):
            ancs = getAncestors(graf, v)
            unmarked_ancs = [w for w in ancs if w not in marked]
            
            v_anc = unmarked_ancs[-1]            

            old_pos = mypi_inv[v_anc]
            mypi[old_pos] = v
            mypi[k] = v_anc
            mypi_inv[v_anc] = k
            mypi_inv[v] = old_pos
            
            marked[v_anc] = 1         
    
    if (DP):
        return((mypi, tree2root))    
    else:   
        return(mypi) 



def updateInferResults(graf, freq, tree2root, 
                       alpha, beta, size_thresh, birth_thresh,
                       node_tree_coo=None):
    """
    Match clustr-trees, update posterior root prob, and
    update node-tree co-occurrence results.
    
    Requires graf.vs has "pa" attribute.
    Requires graf.es has "tree" attribute.


    Parameters
    ----------
    graf : igraph object
        Input graph.
    freq : dict
        Existing posterior root probs; maps k to the 
        posterior root prob of tree k. Modified in place.
    tree2root : list
        list of root nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    size_thresh : float
        Thresh for keeping a cluster-tree. 
    birth_thresh : float
        Thresh for creating new distinct cluster-tree 
    node_tree_coo : nparray, optional
        (i,j)-th entry is num of times node i
        appears in tree j. The default is None.

    Returns
    -------
    nparray of new node-tree co-occurrences; replaces
    existing node_tree_coo.

    """
    n = len(graf.vs)
    
    sizes = getTreeSizes(graf, tree2root)
    sizes_sorted = -np.sort( - np.array(sizes))
    sizes_args = np.argsort( - np.array(sizes))

    K = len(tree2root)
    bigK = len(freq)
    
    tree2root_sorted = [0] * len(tree2root)
    for k in range(K):
        tree2root_sorted[k] = tree2root[sizes_args[k]]
    
    tmp_freq = {}
    treedegs = getAllTreeDeg(graf)
    
    for k in range(K):
        if (sizes_sorted[k] > size_thresh * n):
            tmp_freq[k] = countAllHist(graf, tree2root_sorted[k])[0]
        else:
            break
        
        if (sizes_sorted[k] > 1):
            tmp_freq[k] = tmp_freq[k] * (beta*treedegs+beta+alpha) \
                                * (beta*treedegs + alpha)                           

            tmp_freq[k] = tmp_freq[k]/sum(tmp_freq[k])
                    
    curbigK = len(tmp_freq)
    
    if (curbigK > bigK):
        for k in range(bigK, curbigK):
            freq[k] = np.array([0] * n)
            
            if (node_tree_coo is not None):
                node_tree_coo = np.column_stack((node_tree_coo, np.zeros((n,1))))
        bigK = curbigK
        
    dists = np.zeros((curbigK, bigK))
    
    for k in range(curbigK):
        for kk in range(bigK):
            if (sum(freq[kk] > 0)):
                distr1 = np.array(freq[kk]/sum(freq[kk]) )                        
                distr2 = np.array(tmp_freq[k])

                dists[k, kk] = sum(np.abs(distr1 - distr2))/2
            else:
                dists[k, kk] = 0
                    
    treematch = scipy.optimize.linear_sum_assignment(dists)[1]
                      
    for k in range(curbigK):
        if (dists[k, treematch[k]] > birth_thresh):
            freq[bigK] = np.array([0] * n)
            treematch[k] = bigK
            
            if (node_tree_coo is not None):
                node_tree_coo = np.column_stack((node_tree_coo, np.zeros((n, 1))))
            bigK = bigK + 1
    
    for k in range(curbigK):
        freq[treematch[k]] = freq[treematch[k]] + tmp_freq[k]
      
    for ii in range(n):
        if (node_tree_coo is None):
            break
        
        ants = getAncestors(graf, ii)
        myroot = ants[-1]
        my_k = tree2root_sorted.index(myroot)
        if (sizes_sorted[my_k] <= size_thresh * n):
            continue
                    
        my_kstar = treematch[my_k]
        node_tree_coo[ii, my_kstar] = node_tree_coo[ii, my_kstar] + 1     
    
    return(node_tree_coo)                    
        
        


def reorderSubvector(vec1, vec2, pos_dict):
    """

    Parameters
    ----------
    vec1 : list
        Longer input list.
    vec2 : list
        Shorter input list. Required to be a sub-list of
        vec1.
    pos_dict : dict
        Positions of all elements of vec2 in vec1. Modified in place.

    Returns
    -------
    a list which contains the same elements as vec1
    the sub-list that correspond to elements of vec2 is re-ordered
    according to vec2.

    """
    n = len(vec1)
    m = len(vec2)
    
    all_pos = [0] * m
    for i in range(m):
        all_pos[i] = pos_dict[vec2[i]]
    
    all_pos.sort()
        
    for i in range(m):
        vec1[all_pos[i]] = vec2[i]
        pos_dict[vec2[i]] = all_pos[i]
    
    return(vec1)

Functions

def gibbsFull(graf, Burn=40, M=50, gap=1, alpha=0, beta=1, K=1, display=True, size_thresh=0.01, birth_thresh=0.8, initpi=None)

Full Gibbs sampler for computing posterior root prob and node tree co-occurrence in fixed K setting.

Parameters

graf : igraph object
Input graph.
Burn : int, optional
Num of burn in iterations. The default is 30.
M : int, optional
Num of regular iterations. The default is 50.
gap : int, optional
Num of samples to skip when recording results. The default is 1.
alpha : float, optional
Parameter. The default is 0.
beta : float, optional
Parameter. The default is 1.
K : int, optional
Num of roots/clusters. The default is 1.
display : boolean, optional
Detailed display. The default is True.
size_thresh : float, optional
Thresh for keeping a cluster-tree. The default is 0.01.
birth_thresh : float, optional
Thresh for creating new distinct cluster-tree in output. The default is 0.8.
initpi : list, optional
Initialization for ordering. The default is None.

Returns

0 : nparray of posterior root probs
 
1 : dictionary mapping tree k to its posterior root probs
 
2 : nparray of node tree co-occurrence
 
3 : final roots (used for initiailization)
 
4 : final ordering (used for initialization)
 
Expand source code
def gibbsFull(graf, Burn=40, M=50, gap=1, alpha=0, beta=1, K=1, 
              display=True, size_thresh=0.01, birth_thresh=0.8,
              initpi=None):
    """
    Full Gibbs sampler for computing posterior root prob and 
    node tree co-occurrence in fixed K setting. 

    Parameters
    ----------
    graf : igraph object
        Input graph.
    Burn : int, optional
        Num of burn in iterations. The default is 30.
    M : int, optional
        Num of regular iterations. The default is 50.
    gap : int, optional
        Num of samples to skip when recording results. 
        The default is 1.
    alpha : float, optional
        Parameter. The default is 0.
    beta : float, optional
        Parameter. The default is 1.
    K : int, optional
        Num of roots/clusters. The default is 1.
    display : boolean, optional
        Detailed display. The default is True.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. The default is 0.8.
    initpi : list, optional
        Initialization for ordering. The default is None.

    Returns
    -------
    0: nparray of posterior root probs
    1: dictionary mapping tree k to its posterior root probs
    2: nparray of node tree co-occurrence
    3: final roots (used for initiailization)
    4: final ordering (used for initialization)

    """
    n = len(graf.vs)
    m = len(graf.es)

    if (initpi is None):
        wilsonTree(graf)
        v = choices(range(n))[0]
    
        countSubtreeSizes(graf, v)
        tree2root = [v]
        initpi = sampleOrdering(graf, tree2root, alpha, beta)
    else:
        tree2root = initpi[0:K]
    
    mypi = initpi
    
    
    node_tree_coo = np.zeros((n, 0))
    
    freq = {}
    if (K == 1):
        freq[0] = [0] * n

    
    for i in range(Burn + M):
        
        for v in tree2root:
            assert graf.vs[v]["pa"] is None        
        
        nodewiseSamplePA(graf, mypi, alpha=alpha, beta=beta, K=K)
        tree2root = mypi[0:K]
        mypi = sampleOrdering(graf, tree2root, alpha=alpha, beta=beta)
        
        
        ## sort and display sizes
        sizes = getTreeSizes(graf, tree2root)
        sizes_sorted = -np.sort( - np.array(sizes))
        sizes_args = np.argsort(- np.array(sizes) )
        
        if (display):
            print("iter {0}  sizes {1}".format(i, sizes_sorted))
        
        tree2root_sorted = [0] * len(tree2root)
        for k in range(len(tree2root)):
            tree2root_sorted[k] = tree2root[sizes_args[k]]
        
        """ record results """
        
        if (i >= Burn and i % gap == 0):
            if (K == 1):
                freq[0] = freq[0] + countAllHist(graf, tree2root[0])[0]
            else:   
                node_tree_coo = updateInferResults(graf, freq, tree2root, 
                                                   alpha=alpha, beta=beta, 
                                                   size_thresh=size_thresh, 
                                                   birth_thresh=birth_thresh, 
                                                   node_tree_coo=node_tree_coo)
                
    allfreqs = np.array([0] * n)    
    
    for k in range(len(freq)):
        allfreqs = allfreqs + freq[k]
        freq[k] = freq[k]/sum(freq[k])
        
    allfreqs = allfreqs/sum(allfreqs)
    
    return((allfreqs, freq, node_tree_coo, tree2root, mypi))
def gibbsFullDP(graf, Burn=20, M=50, gap=1, alpha=0, beta=1, alpha0=50, display=True, size_thresh=0.01, birth_thresh=0.8, initpi=None, initroots=None)

Full Gibbs sampler for computing posterior root prob in the random K setting.

Parameters

graf : igraph object
Input graph.
Burn : int, optional
Num of burn in iterations. The default is 30.
M : int, optional
Num of regular iterations. The default is 50.
gap : int, optional
Num of samples to skip when recording results. The default is 1.
alpha : float, optional
Parameter. The default is 0.
beta : float, optional
Parameter. The default is 1.
alpha0 : float, optional
Parameter. The default is 5.
display : boolean, optional
Detailed display. The default is True.
size_thresh : float, optional
Thresh for keeping a cluster-tree. The default is 0.01.
birth_thresh : float, optional
Thresh for creating new distinct cluster-tree in output. The default is 0.8.
initpi : list, optional
Ordering initialization. The default is None.
initroots : list, optional
Root initialization. The default is None.

Returns

0. nparray of length n of posterior root prob
 
1. dict giving posterior root prob for each distinct cluster-tree
 
2. list of all Ks
 
3. final set of roots (used for initialization)
 
4. final alpha0 (used for initialization)
 
5. final ordering (used for initialization)
 
Expand source code
def gibbsFullDP(graf, Burn=20, M=50, gap=1, alpha=0, beta=1, alpha0=50, 
                display=True, size_thresh=0.01, 
                birth_thresh=0.8, initpi=None, initroots=None):
    """
    Full Gibbs sampler for computing posterior root prob 
    in the random K setting.  

    Parameters
    ----------
    graf : igraph object
        Input graph.
    Burn : int, optional
        Num of burn in iterations. The default is 30.
    M : int, optional
        Num of regular iterations. The default is 50.
    gap : int, optional
        Num of samples to skip when recording results. 
        The default is 1.
    alpha : float, optional
        Parameter. The default is 0.
    beta : float, optional
        Parameter. The default is 1.
    alpha0 : float, optional
        Parameter. The default is 5.
    display : boolean, optional
        Detailed display. The default is True.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. The default is 0.8.
    initpi : list, optional
        Ordering initialization. The default is None.
    initroots : list, optional
        Root initialization. The default is None.

    Returns
    -------
    0. nparray of length n of posterior root prob
    1. dict giving posterior root prob for each distinct cluster-tree
    2. list of all Ks
    3. final set of roots (used for initialization)
    4. final alpha0 (used for initialization)
    5. final ordering (used for initialization)

    """
    
    
    n = len(graf.vs)
    m = len(graf.es)
    
    if (initpi is None):
        
        wilsonTree(graf)
        v = choices(range(n))[0]
    
        countSubtreeSizes(graf, v)
        tree2root = [v]
    
        tmp = sampleOrdering(graf, tree2root, alpha, beta, DP=True)
        initpi = tmp[0]
        tree2root = tmp[1]
    else:
        tree2root = initroots
        
    mypi = initpi
    
    allK = []
    
    freq = {}
    bigK = 0    
    
    for i in range(Burn + M):
            
        tree2root = nodewiseSampleDP(graf, mypi, tree2root, alpha=alpha, beta=beta, alpha0=alpha0)
        
        sizes = getTreeSizes(graf, tree2root)
                
        tmp = sampleOrdering(graf, tree2root, alpha=alpha, beta=beta, DP=True)
        mypi = tmp[0]
        tree2root = tmp[1]
    
    
        K = len(tree2root)
    
        sizes_sorted = -np.sort( - np.array(sizes))
        sizes_args = np.argsort( - np.array(sizes))
    
        ## Uncomment to update alpha0
        alpha0tilde = drawAlpha0tilde(K, n, alpha0/(alpha+2*beta))
        alpha0 = alpha0tilde*(alpha+2*beta)
                      
        
        
        if (display):
            print("iter {0}  a0 {1}  K {2}  sizes{3}".format(i, round(alpha0, 3),
                                                             K, sizes_sorted))
            
        """ record results """

        if (i >= Burn and i % gap == 0):
            allK.append(len(tree2root))
            
            updateInferResults(graf, freq, tree2root, 
                               alpha=alpha, beta=beta, 
                               size_thresh=size_thresh, 
                               birth_thresh=birth_thresh)
            
            
    allfreqs = np.array([0] * n)
    for k in range(len(freq)):
        allfreqs = allfreqs + freq[k]
        freq[k] = freq[k]/sum(freq[k])     
        
    return((allfreqs, freq, allK, tree2root, alpha0, mypi))
def gibbsToConv(graf, DP=False, K=1, alpha=0, beta=0, alpha0=50, Burn=10, M=40, gap=1, MAXITER=100, tol=0.1, size_thresh=0.01, birth_thresh=0.8, method='full', burn_thresh=0.95)

Run gibbs sampler to generate posterior root probs.

Parameters

graf : igraph object
Input graph.
DP : boolean, optional
Use random K model or not. The default is False.
K : int, optional
Num of cluster-trees. Ignored if DP is True. The default is 1.
alpha : float, optional
Parameter. Set both alpha=0 and beta=0 (default) to estimate the parameters via EM. The default is 0.
beta : float, optional
Parameter. Set both alpha=0 and beta=0 (default) to estimate the parameter via EM. The default is 0.
alpha0 : float, optional
Initialization for parameter. Ignored if DP is False. The default is 50.
Burn : int, optional
Num of burn iteration. Unimportant if chain runs to convergence. The default is 10.
M : int, optional
Num of iterations per convergence check. The default is 40.
gap : int, optional
Num of samples to skip for recording results. The default is 1.
MAXITER : int, optional
Maximum number of convergence checks. The default is 100.
tol : float, optional
Convergence threshold. The default is 0.1.
size_thresh : float, optional
Thresh for keeping a cluster-tree. Ignored if K==1. The default is 0.01.
birth_thresh : float, optional
Thresh for creating new distinct cluster-tree in output. Ignored if K==1. The default is 0.8.
method : string, optional
Either "full" or "collapsed". The default is "full".
burn_thresh : float, optional
Criterion for determining whether burn in is complete. The default is 0.95.

Returns

0. nparray of posterior root probs
 
1. first chain outputs
 
2. second chain outputs
 
Expand source code
def gibbsToConv(graf, DP=False, K=1, 
                alpha=0, beta=0, alpha0=50,
                Burn=10, M=40, gap=1, 
                MAXITER=100, tol=0.1, 
                size_thresh=0.01, birth_thresh=0.8,
                method="full",
                burn_thresh = 0.95):
    """
    Run gibbs sampler to generate posterior root probs.
    

    Parameters
    ----------
    graf : igraph object
        Input graph.
    DP : boolean, optional
        Use random K model or not. The default is False.
    K : int, optional
        Num of cluster-trees. Ignored if DP is True. The default is 1.
    alpha : float, optional
        Parameter. Set both alpha=0 and beta=0 (default) to 
        estimate the parameters via EM. The default is 0. 
    beta : float, optional
        Parameter. Set both alpha=0 and beta=0 (default) to estimate
        the parameter via EM. The default is 0.
    alpha0 : float, optional
        Initialization for parameter. Ignored if DP is False. The default is 50.
    Burn : int, optional
        Num of burn iteration. Unimportant if chain runs to 
        convergence. The default is 10.
    M : int, optional
        Num of iterations per convergence check. The default is 40.
    gap : int, optional
        Num of samples to skip for recording results. The default is 1.
    MAXITER : int, optional
        Maximum number of convergence checks. The default is 100.
    tol : float, optional
        Convergence threshold. The default is 0.1.
    size_thresh : float, optional
        Thresh for keeping a cluster-tree. 
        Ignored if K==1. The default is 0.01.
    birth_thresh : float, optional
        Thresh for creating new distinct cluster-tree 
        in output. 
        Ignored if K==1.
        The default is 0.8.
    method : string, optional
        Either "full" or "collapsed". The default is "full".
    burn_thresh : float, optional
        Criterion for determining whether burn in is
        complete. The default is 0.95.

    Returns
    -------
    0. nparray of posterior root probs
    1. first chain outputs
    2. second chain outputs

    """
    
    n = len(graf.vs)
    m = len(graf.es)
    graf2 = graf.copy()
    
    if (alpha == 0 and beta == 0):
        beta = 1
        alpha = estimateAlphaEM(graf, display=False)
        print("Estimated alpha as {0}".format(alpha))
    else:
        print("Using alpha {0} and beta {1}".format(alpha, beta))
    
    if (DP):
        print("Using random K model")
    else:
        print("Using fixed K={0} model".format(K))
        
        
    
    options = {"Burn": Burn, "M": M, "gap": gap, "alpha": alpha, 
               "beta": beta, "display": False, "size_thresh": size_thresh,
               "birth_thresh": birth_thresh}    
    
    
    if (DP and method == "full"):
        gibbsFn = gibbsFullDP
    
    if ((not DP) and method == "full"):
        gibbsFn = gibbsFull
        
    if (DP and method == "collapsed"):
        gibbsFn = grafting.gibbsGraftDP
    if ((not DP) and method== "collapsed"):
        gibbsFn = grafting.gibbsGraft
    
    
    if (not DP):
        res = gibbsFn(graf, K=K, **options)
        res1 = gibbsFn(graf2, K=K, **options)
    else:
        res = gibbsFn(graf, alpha0=alpha0, **options)
        res1 = gibbsFn(graf2, alpha0=alpha0, **options)        


    allfreq = np.array([0] * n)
    allfreq1 = np.array([0] * n)
    
    for i in range(MAXITER):
        
        allfreq = allfreq + np.array(res[0])
        allfreq1 = allfreq1 + np.array(res1[0])
            
        p1 = allfreq/sum(allfreq)
        p2 = allfreq1/sum(allfreq1)
        
        deviation = (1/2)*sum(np.abs( p1**(1/2) - p2**(1/2) )**2)
        print((i, deviation))
        
        if (deviation < tol):
            break
        
        if (deviation > burn_thresh):
            allfreq = np.array([0] * n)
            allfreq1 = np.array([0] * n)
        
        Mp = M*(i+1)
        
        options["Burn"] = 0
        options["M"] = Mp
        
        if ( (not DP) and method=="full"):
            res = gibbsFn(graf, K=K, initpi=res[-1], **options)
            res1 = gibbsFn(graf2, K=K, initpi=res1[-1], **options)
            
        if ( (not DP) and method=="collapsed"):
            res = gibbsFn(graf, K=K, initroots=res[-1], **options)
            res1 = gibbsFn(graf2, K=K, initroots=res1[-1], **options)
            
        if (DP and method=="full"):
            res = gibbsFn(graf, initpi=res[-1], alpha0=res[-2], initroots=res[-3], **options)
            res1 = gibbsFn(graf2, initpi=res1[-1], alpha0=res1[-2], initroots=res1[-3], **options)    
            
        if (DP and method=="collapsed"):
            res = gibbsFn(graf, alpha0=res[-2], initroots=res[-1], **options)
            res1 = gibbsFn(graf2, alpha0=res1[-2], initroots=res1[-1], **options)
    
    allfreq = allfreq + allfreq1
    allfreq = allfreq/sum(allfreq)
    
    return((allfreq, res, res1))
def nodewiseSampleDP(graf, mypi, tree2root, alpha, beta, alpha0)

Generates new forest for a given ordering by sampling a new parent for each node. Used in random K setting.

Require: graf.es has "tree" attribute

Parameters

graf : igraph object
Input graph; "tree" edge attribute and "pa" node attributes are modified in place.
mypi : list
Given ordering of the nodes.
tree2root : list
Lists of the roots for each of the trees.
alpha : float
Parameter.
beta : float
Parameter.
alpha0 : float
Parameter.

Returns

New list of roots
 
Expand source code
def nodewiseSampleDP(graf, mypi, tree2root, alpha, beta, alpha0):
    """
    Generates new forest for a given ordering by sampling
    a new parent for each node. Used in random K setting.

    Require: graf.es has "tree" attribute    

    Parameters
    ----------
    graf : igraph object
        Input graph; "tree" edge attribute and "pa" node
        attributes are modified in place.
    mypi : list
        Given ordering of the nodes.
    tree2root : list
        Lists of the roots for each of the trees.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    alpha0 : float
        Parameter.

    Returns
    -------
    New list of roots

    """
    n = len(graf.vs)
    m = len(graf.es)
    n2 = n*(n-1)/2
    
    ## DEBUG
    getTreeSizes(graf, tree2root)
    
    
    root_dict = {}
    for v in tree2root:
        root_dict[v] = 1
        
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i    
                
    all_tree_degs = getAllTreeDeg(graf)        
    assert sum(all_tree_degs) == 2*(n-len(tree2root))
    
    
    edge_ls = []
    curK = len(tree2root)
    for i in range(n-1):
       
        k = i + 1
        u = mypi[k]
        mypa = graf.vs[u]["pa"]
        uisroot = (mypa == None)
        
        nbs = graf.neighbors(u)
        nbs = [w for w in nbs if mypi_inv[w] < k]

        tree_degs = np.array([all_tree_degs[w] for w in nbs])
        root_adj = np.array([w in root_dict for w in nbs])
        pa_adj = np.array([w == mypa for w in nbs])
        
        tmp_p = beta*tree_degs + 2*beta*root_adj - beta*pa_adj + alpha
        
        new_root_wt = alpha0 * (m-n+curK+1-uisroot)/(n2-n+curK+1-uisroot) * \
            (beta*all_tree_degs[u] + beta*uisroot + alpha)/(beta+alpha)
        
        tmp_p = np.append(tmp_p, new_root_wt)
        
        """ draw a new parent for u"""
        nbs.append(-1)
        myw = choices(nbs, weights=tmp_p)[0]
        
        if (myw == -1):
            myw = None
        
        if (myw == mypa):
            if (mypa != None):
                edge_ls.append((u, mypa))
            
            continue
        
        """ modifying pa, all_tree_degs """
        if (myw != None):
            all_tree_degs[myw] = all_tree_degs[myw] + 1
            if (not uisroot):
                all_tree_degs[mypa] = all_tree_degs[mypa] - 1
            else:
                all_tree_degs[u] = all_tree_degs[u] + 1
                root_dict.pop(u)
                curK = curK - 1
            
            edge_ls.append((u, myw))
        else:
            ## u was not a root, became a root
            assert mypa != None
            root_dict[u] = 1
            curK = curK + 1
            all_tree_degs[u] = all_tree_degs[u] - 1
            all_tree_degs[mypa] = all_tree_degs[mypa] - 1


    assert len(edge_ls) == (n - curK)
    graf.es["tree"] = 0
    graf.vs["pa"] = None
    
    graf.es[graf.get_eids(edge_ls)]["tree"] = 1
    
    rootset = list(root_dict.keys())
    return(rootset)
def nodewiseSamplePA(graf, mypi, alpha, beta, K)

Generates new forest for a given ordering by sampling a new parent for each node. Used in fixed K setting.

Require: graf.es has "tree" attribute

Parameters

graf : igraph object
Input graph; "tree" edge attribute and "pa" node attributes are modified in place.
mypi : list
Given ordering of the nodes.
alpha : float
Parameter.
beta : float
Parameter.
K : int
Num of clusters.

Returns

None.

Expand source code
def nodewiseSamplePA(graf, mypi, alpha, beta, K):
    """
    Generates new forest for a given ordering by sampling
    a new parent for each node. Used in fixed K setting.

    Require: graf.es has "tree" attribute

    Parameters
    ----------
    graf : igraph object
        Input graph; "tree" edge attribute and "pa" node
        attributes are modified in place.
    mypi : list
        Given ordering of the nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    K : int
        Num of clusters.

    Returns
    -------
    None.

    """
    n  = len(graf.vs)
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i

    for k in range(K):
        countSubtreeSizes(graf, mypi[k])

    all_tree_degs = [0] * n
    for i in range(n):
        mypa = graf.vs[i]["pa"]
        if (mypa != None):
            all_tree_degs[mypa] = all_tree_degs[mypa] + 1
            all_tree_degs[i] = all_tree_degs[i] + 1
        
    edge_ls = []
    
    for i in range(n-K):
        k = K+i
        v = mypi[k]
        
        mypa = graf.vs[v]["pa"]
        assert mypa is not None
        ## adjust parent degree
        all_tree_degs[mypa] = all_tree_degs[mypa] - 1
        
        nbs = graf.neighbors(v)
        nbs = [w for w in nbs if mypi_inv[w] < k]

        tree_degs = [all_tree_degs[w] for w in nbs]
        tree_degs = np.array(tree_degs)

        root_adj = np.array([w in mypi[0:K] for w in nbs])
        
        if (K == 1):
            root_adj = 0
        
        """ generate new parent for u"""
        tmp_p = beta*tree_degs + 2*beta*root_adj + alpha
        myw = choices(nbs, weights=tmp_p)[0]

        edge_ls.append((v, myw))
        ## myw may potentially be mypa
        all_tree_degs[myw] = all_tree_degs[myw] + 1
        
    assert len(edge_ls) == (n - K)
    graf.es["tree"] = 0
    graf.vs["pa"] = None
    
    graf.es[graf.get_eids(edge_ls)]["tree"] = 1
def reorderSubvector(vec1, vec2, pos_dict)

Parameters

vec1 : list
Longer input list.
vec2 : list
Shorter input list. Required to be a sub-list of vec1.
pos_dict : dict
Positions of all elements of vec2 in vec1. Modified in place.

Returns

a list which contains the same elements as vec1
 
the sub-list that correspond to elements of vec2 is re-ordered
 

according to vec2.

Expand source code
def reorderSubvector(vec1, vec2, pos_dict):
    """

    Parameters
    ----------
    vec1 : list
        Longer input list.
    vec2 : list
        Shorter input list. Required to be a sub-list of
        vec1.
    pos_dict : dict
        Positions of all elements of vec2 in vec1. Modified in place.

    Returns
    -------
    a list which contains the same elements as vec1
    the sub-list that correspond to elements of vec2 is re-ordered
    according to vec2.

    """
    n = len(vec1)
    m = len(vec2)
    
    all_pos = [0] * m
    for i in range(m):
        all_pos[i] = pos_dict[vec2[i]]
    
    all_pos.sort()
        
    for i in range(m):
        vec1[all_pos[i]] = vec2[i]
        pos_dict[vec2[i]] = all_pos[i]
    
    return(vec1)
def sampleOrdering(graf, tree2root, alpha, beta, DP=False)

Condition on the forest, generate a new root for each tree and generate a new global ordering.

Require: graf.vs has "pa" attribute; graf.es has "tree" attribute

Parameters

graf : igraph object
Input graph; "pa" and "subtree_size" vertex attributes modified in place.
tree2root : list
list of root nodes.
alpha : float
Parameter.
beta : float
Parameter.
DP : boolean, optional
Use random K model or not. The default is False.

Returns

0. new node ordering
 
1. list of new roots (only used in random K setting)
 
Expand source code
def sampleOrdering(graf, tree2root, alpha, beta, DP=False):
    """
    Condition on the forest, generate a new root for each
    tree and generate a new global ordering.
    
    Require: graf.vs has "pa" attribute; graf.es has "tree" attribute

    Parameters
    ----------
    graf : igraph object
        Input graph; "pa" and "subtree_size" vertex attributes
        modified in place.
    tree2root : list
        list of root nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    DP : boolean, optional
        Use random K model or not. The default is False.

    Returns
    -------
    0. new node ordering
    1. list of new roots (only used in random K setting)

    """
    K = len(tree2root)
    n = len(graf.vs)
    
    time3 = time.time()
    
    degs = getAllTreeDeg(graf)
    
    mypi = [0] * n
    
    tree_sizes = getTreeSizes(graf, tree2root)
    
    """ draw new roots for each subtree """
    for k in range(K):
        if (tree_sizes[k] == 1):
            graf.vs[tree2root[k]]["subtree_size"] = 1
            mypi[k] = tree2root[k]
            continue
        
        cur_root = tree2root[k]
        normalized_h = countAllHist(graf, cur_root)[0]
        
        deg_adj = (beta*degs + beta + alpha) * (beta*degs + alpha)
        if (K == 1):
            deg_adj = 1
        
        tmp_p = normalized_h*deg_adj
        mypi[k] = choices(range(n), tmp_p)[0]
        tree2root[k] = mypi[k]
        
        countSubtreeSizes(graf, root=mypi[k])
    
    if (DP):
        wts = [graf.vs[tree2root[k]]["subtree_size"] for k in range(K)]
        assert(sum(wts) == n)
        
        mypi[0] = tree2root[choices(range(K), wts)[0]]
        remain_nodes = [i for i in list(range(n)) if i != mypi[0]]
        assert mypi[0] not in remain_nodes
        
        mypi[1:n] = np.random.permutation(remain_nodes)
    else:
        remain_nodes = [i for i in list(range(n)) if i not in mypi[0:K]]
        mypi[K:n] = np.random.permutation(remain_nodes)
    
    mypi_inv = [0] * n
    for i in range(n):
        mypi_inv[mypi[i]] = i
    
    
    
    marked = {}
    
    if (DP):
        marked[mypi[0]] = 1
    else:
        for k in range(K):
            marked[mypi[k]] = 1
    
    for i in range(n-1):
        
        if (DP):
            k = 1 + i
        else:   
            k = K + i
            if (k >= n):
                break
            
        v = mypi[k]
        
        if (not DP): 
            assert v not in tree2root
            assert graf.vs[v]["pa"] != None
        
        if (v not in marked):
            ancs = getAncestors(graf, v)
            unmarked_ancs = [w for w in ancs if w not in marked]
            
            v_anc = unmarked_ancs[-1]            

            old_pos = mypi_inv[v_anc]
            mypi[old_pos] = v
            mypi[k] = v_anc
            mypi_inv[v_anc] = k
            mypi_inv[v] = old_pos
            
            marked[v_anc] = 1         
    
    if (DP):
        return((mypi, tree2root))    
    else:   
        return(mypi) 
def updateInferResults(graf, freq, tree2root, alpha, beta, size_thresh, birth_thresh, node_tree_coo=None)

Match clustr-trees, update posterior root prob, and update node-tree co-occurrence results.

Requires graf.vs has "pa" attribute. Requires graf.es has "tree" attribute.

Parameters

graf : igraph object
Input graph.
freq : dict
Existing posterior root probs; maps k to the posterior root prob of tree k. Modified in place.
tree2root : list
list of root nodes.
alpha : float
Parameter.
beta : float
Parameter.
size_thresh : float
Thresh for keeping a cluster-tree.
birth_thresh : float
Thresh for creating new distinct cluster-tree
node_tree_coo : nparray, optional
(i,j)-th entry is num of times node i appears in tree j. The default is None.

Returns

nparray of new node-tree co-occurrences; replaces
 

existing node_tree_coo.

Expand source code
def updateInferResults(graf, freq, tree2root, 
                       alpha, beta, size_thresh, birth_thresh,
                       node_tree_coo=None):
    """
    Match clustr-trees, update posterior root prob, and
    update node-tree co-occurrence results.
    
    Requires graf.vs has "pa" attribute.
    Requires graf.es has "tree" attribute.


    Parameters
    ----------
    graf : igraph object
        Input graph.
    freq : dict
        Existing posterior root probs; maps k to the 
        posterior root prob of tree k. Modified in place.
    tree2root : list
        list of root nodes.
    alpha : float
        Parameter.
    beta : float
        Parameter.
    size_thresh : float
        Thresh for keeping a cluster-tree. 
    birth_thresh : float
        Thresh for creating new distinct cluster-tree 
    node_tree_coo : nparray, optional
        (i,j)-th entry is num of times node i
        appears in tree j. The default is None.

    Returns
    -------
    nparray of new node-tree co-occurrences; replaces
    existing node_tree_coo.

    """
    n = len(graf.vs)
    
    sizes = getTreeSizes(graf, tree2root)
    sizes_sorted = -np.sort( - np.array(sizes))
    sizes_args = np.argsort( - np.array(sizes))

    K = len(tree2root)
    bigK = len(freq)
    
    tree2root_sorted = [0] * len(tree2root)
    for k in range(K):
        tree2root_sorted[k] = tree2root[sizes_args[k]]
    
    tmp_freq = {}
    treedegs = getAllTreeDeg(graf)
    
    for k in range(K):
        if (sizes_sorted[k] > size_thresh * n):
            tmp_freq[k] = countAllHist(graf, tree2root_sorted[k])[0]
        else:
            break
        
        if (sizes_sorted[k] > 1):
            tmp_freq[k] = tmp_freq[k] * (beta*treedegs+beta+alpha) \
                                * (beta*treedegs + alpha)                           

            tmp_freq[k] = tmp_freq[k]/sum(tmp_freq[k])
                    
    curbigK = len(tmp_freq)
    
    if (curbigK > bigK):
        for k in range(bigK, curbigK):
            freq[k] = np.array([0] * n)
            
            if (node_tree_coo is not None):
                node_tree_coo = np.column_stack((node_tree_coo, np.zeros((n,1))))
        bigK = curbigK
        
    dists = np.zeros((curbigK, bigK))
    
    for k in range(curbigK):
        for kk in range(bigK):
            if (sum(freq[kk] > 0)):
                distr1 = np.array(freq[kk]/sum(freq[kk]) )                        
                distr2 = np.array(tmp_freq[k])

                dists[k, kk] = sum(np.abs(distr1 - distr2))/2
            else:
                dists[k, kk] = 0
                    
    treematch = scipy.optimize.linear_sum_assignment(dists)[1]
                      
    for k in range(curbigK):
        if (dists[k, treematch[k]] > birth_thresh):
            freq[bigK] = np.array([0] * n)
            treematch[k] = bigK
            
            if (node_tree_coo is not None):
                node_tree_coo = np.column_stack((node_tree_coo, np.zeros((n, 1))))
            bigK = bigK + 1
    
    for k in range(curbigK):
        freq[treematch[k]] = freq[treematch[k]] + tmp_freq[k]
      
    for ii in range(n):
        if (node_tree_coo is None):
            break
        
        ants = getAncestors(graf, ii)
        myroot = ants[-1]
        my_k = tree2root_sorted.index(myroot)
        if (sizes_sorted[my_k] <= size_thresh * n):
            continue
                    
        my_kstar = treematch[my_k]
        node_tree_coo[ii, my_kstar] = node_tree_coo[ii, my_kstar] + 1     
    
    return(node_tree_coo)