# laydi/fluents/lib/nx_utils.py
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
     outer,sum,ravel,array,mean
from numpy.random import rand
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
    Affinity = 1 - distance (rescaled so that the smallest value means far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
    weight:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
        Similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1]
Normalized so max weight = 1.0
"""
    try:
        from Bio import Cluster as CLS
    except ImportError:
        raise ImportError, "Biopython (Bio.Cluster) is required"
nVar = len(data)
nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i,gene in enumerate(ids): # ids define the row ordering
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((nVar,nVar),dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = max(ravel(D))/MAX
    D_n = D/D_max # normalised (max = MAX)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
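# Example (sketch, not from the original code): assuming Bio.Cluster is
# installed and `data` maps gene ids to expression vectors,
#   K = get_affinity_matrix(G, data, ids, dist='c', out='heat_kernel')
# returns a heat-kernel style affinity matrix over the nodes listed in `ids`.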
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
                bad_nodes.append(node)
        if len(bad_nodes)>0:
            G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
    Resilience measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
        n_comp = len(sub_graphs)
        if n_comp>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
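# Example (sketch, not from the original code): on a connected graph G,
#   resilience, labels = key_players(G, with_labels=True)
# returns 1 - (fraction of nodes disconnected when each node is removed);
# nodes whose removal strands a large part of the graph therefore get low values.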
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
    output: A -- weighted adjacency matrix
[index] -- node labels
"""
n=G.order()
# make an dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
    if G.__class__.__name__ == 'XGraph':
        raise ValueError, "Use a plain Graph; XGraph is not supported here"
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # note: this is the arithmetic mean of the two node weights
            a[index[head],index[tail]] = mean([weights[head], weights[tail]])
            a[index[tail],index[head]] = a[index[head],index[tail]]
if with_labels:
return a,index
else:
return a
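# Example (sketch, not from the original code): using node degrees as weights,
#   weights = dict((node, float(G.degree(node))) for node in G.nodes())
#   A, idx = node_weighted_adj_matrix(G, weights=weights, with_labels=True)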
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
A, labels = NX.adj_matrix(G, with_labels=True)
W = A.astype('<f8')
for orf, i in labels.items():
for orf2, j in labels.items():
if G.has_edge(orf, orf2):
edge_weight = G.get_edge(orf, orf2)
W[i,j] = edge_weight
W[j,i] = edge_weight
if with_labels==True:
return W, labels
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
ouput: d,mn_d: degree, and average degree of neighb.
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if (num_union + num_intersection) == 0:
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
return xs
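# Example (sketch, not from the original code): if n1 has neighbours {a, b, c}
# and n2 has neighbours {b, c, d}, the union has 4 members and the
# intersection 2, so x_s = (4 - 2)/(4 + 2) = 1/3.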
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
def graph_corrcoeff(G):
"""Not finnished.
"""
A,index = NX.adj_matrix(G,with_labels=True)
#C = zeros(*A.shape(),'d')
n = 1.*G.number_of_nodes()
for node in G.nodes():
a_j = A[index[node],:] #neighbors
mean_a = sum(a_j)/n# degree(G)/number_of_nodes()
var_a = sqrt(sum((a_j - mean_a)**2)/n)
pass
def graph_and_data_intersection(data, graph, pathways=None,
keep_connected=True):
"""Returns the intersection of keys in two dictionaries.
NB: keep track of identifer sorting after these dict transforms.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx,base.graph, full graph
pathways -- dict, keys: pathway name, values: nodes in pathway
call:
new_data, new_graph,pathways = graph_and_data_intersection(data,graph,pathways,keep_connected=True)
"""
new_graph = graph.copy()
new_data = {}
new_pathways = {}
graph_set = set(graph.nodes())
data_set = set(data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set) #remove difference
for k in intersection:
new_data[k] = data[k]
if keep_connected:
max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # keep the largest connected component
old_data = new_data
while new_graph.number_of_nodes() != len(new_data) and max_iter<100:
max_iter+=1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set)
new_data={}
for k in intersection:
new_data[k] = old_data[k]
old_data = new_data.copy()
new_graph = NX.connected_component_subgraphs(new_graph)[0]
if pathways!=None:
for pth,nodes in pathways.items():
new_pathways[pth] = [node for node in nodes if node in new_graph]
print "\nSUMMARY (graph_and_data_intersection): "
print "Number of input variables: %s\n\
Number nodes in input graph: %s" %(len(data),len(graph))
print "\nUsing intersection of connected graph and nodes with data values"
print "Number of variables is now: %s" %len(new_data)
print "Number of nodes in graph: %s" %new_graph.number_of_nodes()
if pathways!=None:
return new_data,new_graph,new_pathways
else:
return new_data,new_graph
def rx_graph_and_data_intersection(graph,node_data,pathways,data,keep_connected=False):
"""Returns a (connected) reaction graph with present gene expression data.
keep_connected==True:
When a node (gene) is not present in our expression data, the node
is deleted and all neighbors are connected with edge weight=0.5
if the are not already neigbors.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx.xbase.xgraph, full wieghted graph
node_data -- dict, keys: rx id, value: set of gene_ids
pathways -- dict, keys: pathway name, values: lidt of nodes in pathway
"""
# We do not connect the full graph ... may be performed by using the reference graph?
graph = NX.connected_component_subgraphs(graph)[0] #largest connected component
new_graph = graph.copy()
new_data = {}
new_node_data = node_data.copy()
new_pathways = {}
genes_in_graph=set()
genes_in_data = set(data.keys())
rx_in_graph = set(new_graph.nodes())
# genes in graph nodes (rx_nodes)
for rx in rx_in_graph:
genes_in_graph.update(set(new_node_data.get(rx)))
keep_genes = genes_in_data.intersection(genes_in_graph) #both in graph and data
#update node data
for rx,genes in node_data.items(): # delete node data of nodes not present in graph
genes = set(genes)
genes.intersection_update(keep_genes) #remove genes if they are not in inters.
if len(genes)==0 or rx not in rx_in_graph: #no gene data or not in graph
print "removing: " + str(rx)
del new_node_data[rx]
rx_in_data= set(new_node_data.keys())
rx_intersection = rx_in_data.intersection(rx_in_graph)
for gene in keep_genes:
new_data[gene] = data.get(gene)
# update pathways nodes
for pth,genes in pathways.items():
if genes:
genes = set(genes)
genes.intersection_update(keep_genes) # gene needs to have data
else:
pass
new_pathways[pth] = genes
bad_nodes = rx_in_graph.difference(rx_in_data) #in graph but no data
if keep_connected==True:
dummy = new_graph.copy()
for rx in bad_nodes:
dummy.delete_node(rx)
if len(NX.connected_component_subgraphs(dummy))>1:
nghbrs = new_graph.neighbors(rx)
for i in nghbrs:
for j in nghbrs:
if i!=j:
if not new_graph.has_edge(i,j):
new_graph.add_edge(i,j,0.5)
#update graph
new_graph.delete_nodes_from(list(bad_nodes))
return new_graph,new_node_data,new_pathways,new_data
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
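# Example (sketch, not from the original code): for an XGraph G with numeric
# edge weights,
#   L, idx = weighted_laplacian(G, with_labels=True)
# gives L = D - W, where the diagonal of D holds the weighted node degrees.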
def subnetworks(G, T2):
"""Return the highest scoring (T2-test) subgraph og G.
Use simulated annealing to identify highly scoring subgraphs.
ref: -- Ideker et.al (Bioinformatics 18, 2002)
-- Patil and Nielsen (PNAS 2006)
"""
N = 1000
states = [(node, False) for node in G.nodes()]
t2_last = 0.0
    for i in xrange(N):
        if i==0: # assign random initial states
            states = [(state[0], rand() > .5) for state in states]
        sub_nodes = [state[0] for state in states if state[1]]
        Gsub = NX.subgraph(G, sub_nodes)
        Gsub = NX.connected_component_subgraphs(Gsub)[0]
        t2 = sum([T2[node] for node in Gsub]) # aggregate score of the subgraph
        if t2 > t2_last:
            t2_last = t2
        else:
            # FIXME: annealing acceptance step (p = exp(...)) not implemented
            pass
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of the exp. of the adj. matrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W, normalised=True, alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix square root of the pseudo inverse of L.
    Also known as the average commute time matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
T = diag(sqrt(1./sum(W, 0)))
L = dot(dot(T, L), T)
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
K = dot(dot(vr, diag(psigma)), vri).astype(t)
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D-W
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
L = dot(dot(T,L),T)
e,vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
    # normalise the columns of W to unit length
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
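if __name__ == "__main__":
    # Minimal usage sketch (assumption: a small toy graph; this block is not
    # part of the original module).  Build a graph, form its adjacency matrix
    # and compare a diffusion kernel with the modularity kernel score.
    G = NX.Graph()
    G.add_edges_from([(0, 1), (1, 2), (2, 3), (1, 3)])
    A = asarray(NX.adj_matrix(G, nodelist=G.nodes()), dtype='<f8')
    Kd = K_diffusion(A, normalised=True, alpha=1.0, beta=0.5)
    Km = K_modularity(A, alpha=1.0)
    score, total = kernel_score(Km, A.copy())
    print "diffusion kernel shape:", Kd.shape
    print "modularity kernel score:", total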