import os,sys
from itertools import izip

import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
     outer,sum,atleast_2d,ravel,array,mean
from scipy.linalg import eig,svd,inv,expm,norm
from numpy.random import rand
from cx_utils import sorted_eig
import numpy

eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1, 'i': 1}

class NXUTILSException(Exception): pass

def xgraph_to_graph(G):
    """Convert an XGraph to an ordinary graph.

    Edge attributes, multiple edges and self-loops are lost in the process.
    """
    GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
    return GG

def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
    """Calculate a general affinity matrix, based upon distances.

    Affinities are inverted, rescaled distances: high values mean close
    (similar), 1.0 means far apart.

    input:
        data: gene expression data, type dict: data[gene] = expression-vector
        G: the network (networkx graph object)
        ids: node list, aligned with the rows of the data
        mask: shows which data are missing; if mask[i][j]==0, then
              data[i][j] is missing
        weight: weights to be used when calculating distances
        dist: defines the distance function to be used:
            dist=='e': Euclidean distance
            dist=='b': City Block distance
            dist=='h': Harmonically summed Euclidean distance
            dist=='c': Pearson correlation
            dist=='a': absolute value of the correlation
            dist=='u': uncentered correlation
            dist=='x': absolute uncentered correlation
            dist=='s': Spearman's rank correlation
            dist=='k': Kendall's tau
            For other values of dist, the default (Euclidean distance)
            is used.

    output:
        D: similarity matrix (nGenes x nGenes), symmetric, normalised
           against the maximum distance
    """
    try:
        from Bio import Cluster as CLS
    except:
        raise NXUTILSException("Import of Biopython failed")
    n_var = len(data)
    n_samp = len(data[data.keys()[0]])
    X = zeros((n_var, n_samp), dtype='<f8')
    for i, gene in enumerate(ids):
        X[i,:] = data[gene]
    # distancematrix returns the lower triangle as a list of rows,
    # where row i has length i
    D_list = CLS.distancematrix(X, mask=mask, weight=weight, dist=dist)
    D = zeros((n_var, n_var), dtype='<f8')
    for i, row in enumerate(D_list):
        if i>0:
            D[i,:len(row)] = row
    D = D + D.T
    MAX = 30.0
    D_max = max(ravel(D))/MAX
    D_n = D/D_max # normalised (max = 30.0)
    D_n = (MAX + 1.) - D_n # using correlation (inverse distance for dists)
    A = NX.adj_matrix(G, nodelist=ids)
    if out=='dist':
        return D_n*A
    elif out=='heat_kernel':
        t = 1.0
        K = exp(-t*D*A)
        return K
    elif out=='complete':
        return D_n
    else:
        return []

def remove_one_degree_nodes(G, iter=True):
    """Removes all nodes with only one neighbour.

    These nodes do not contribute to community structure.

    input: G -- graph
           iter -- True/False, iteratively remove?
    """
    G_copy = G.copy()
    if iter==True:
        while 1:
            bad_nodes = []
            for node in G_copy.nodes():
                if len(G_copy.neighbors(node))==1:
                    bad_nodes.append(node)
            if len(bad_nodes)>0:
                G_copy.delete_nodes_from(bad_nodes)
            else:
                break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
                bad_nodes.append(node)
        if len(bad_nodes)>0:
            G_copy.delete_nodes_from(bad_nodes)
    print "Deleted %s nodes from network" %(len(G)-len(G_copy))
    return G_copy

def key_players(G, n=1, with_labels=False):
    """Resilience measure.

    Identifies key nodes by the fraction of nodes left in a
    disconnected subgraph when the node is removed.

    output: fraction of nodes disconnected when node i is removed
    """
    i = 0
    frac = []
    labels = {}
    for node in G.nodes():
        i += 1
        print i
        T = G.copy()
        T.delete_node(node)
        n_nodes = T.number_of_nodes()
        sub_graphs = NX.connected_component_subgraphs(T)
        n_comp = len(sub_graphs)
        if n_comp>1:
            strong_comp = sub_graphs[0]
            fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
            frac.append(fraction)
            labels[node] = fraction
        else:
            frac.append(0.0)
            labels[node] = 0.0
    out = 1.0 - array(frac)
    if with_labels==True:
        return out, labels
    else:
        return out
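# Usage sketch (assumption: the toy graph and the helper name below are
# illustrative only, not part of the original module API):
def _example_key_players():
    """Rank nodes of a small toy graph by the resilience measure."""
    G = NX.Graph()
    G.add_edges_from([(1,2), (2,3), (3,4), (4,5), (3,6)])
    scores, labels = key_players(G, with_labels=True)
    # cut-nodes (here node 3) disconnect a large fraction of the network
    # when removed, so they get the highest fractions in labels
    for node in G.nodes():
        print node, labels[node]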
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
    """Return a weighted adjacency matrix of graph. The weights are
    node weights.

    input: G -- graph
           weights -- dict, keys: nodes, values: weights
           with_labels -- True/False, return labels?

    output: A -- weighted adjacency matrix
            [index] -- node labels
    """
    n = G.order()
    # make a dictionary that maps vertex name to position
    index = {}
    count = 0
    for node in G.nodes():
        index[node] = count
        count = count + 1
    a = zeros((n, n))
    if isinstance(G, NX.XGraph):
        raise NXUTILSException("Use weighted_adj_matrix for edge-weighted XGraphs")
    for head, tail in G.edges():
        if ave_type == 'geometric':
            a[index[head],index[tail]] = sqrt(weights[head]*weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # harmonic mean of the two node weights
            a[index[head],index[tail]] = 2.*weights[head]*weights[tail]/(weights[head] + weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
    if with_labels:
        return a, index
    else:
        return a

def weighted_adj_matrix(G, with_labels=False):
    """Adjacency matrix of an XGraph whose weights are given in edges.
    """
    A, labels = NX.adj_matrix(G, with_labels=True)
    W = A.astype('<f8')
    for head, tail, w in G.edges():
        W[labels[head],labels[tail]] = w
        W[labels[tail],labels[head]] = w
    if with_labels==True:
        return W, labels
    else:
        return W

def average_neighbour_degree(G):
    """Return (degree, mean neighbour degree) for every connected node.
    """
    d = G.degree(with_labels=True)
    out = []
    for node in G.nodes():
        nn = G.neighbors(node)
        if len(nn)>0:
            nn_d = mean([float(d[i]) for i in nn])
            out.append((d[node], nn_d))
    return array(out).T

def struct_equivalence(G, n1, n2):
    """Returns the structural equivalence of a node pair.

    Two nodes are structurally equal if they share the same neighbors.

    x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/
          [ne(n1) union ne(n2) + ne(n1) intersection ne(n2)]

    ref: Brun et.al 2003
    """
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if (num_union + num_intersection)==0:
        xs = 0.0
    else:
        xs = 1.0*(num_union - num_intersection)/(num_union + num_intersection)
    return xs

def struct_equivalence_all(G):
    """Not finished.
    """
    A, labels = NX.adj_matrix(G, with_labels=True)
    pass

def hamming_distance(n1, n2):
    """Not finished.
    """
    pass

def graph_corrcoeff(G, vec=None, nodelist=None, sim='corr'):
    """Returns the correlation coefficient for each node. The
    correlation coefficient is between the node and its neighbours.

    Not finished.
    """
    if nodelist==None:
        nodelist = G.nodes()
    if vec==None:
        vec = G.degree(nodelist)
    if len(vec)!=len(nodelist):
        raise NXUTILSException("The node value vector is not of same length (%s) as the nodelist (%s)" %(len(vec), len(nodelist)))
    vec = asarray(vec)
    A = asarray(NX.adj_matrix(G, nodelist=nodelist))
    for i, node in enumerate(nodelist):
        nei_i = A[i,:]==1
        vec_i = vec[nei_i]

def weighted_laplacian(G, with_labels=False):
    """Return standard Laplacian of graph from a weighted adjacency matrix."""
    n = G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
    L = D - A
    if with_labels:
        A, index = weighted_adj_matrix(G, with_labels=True)
        return L, index
    else:
        return L
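# Usage sketch (assumption: the toy graph and the helper name are
# illustrative only):
def _example_struct_equivalence():
    """Structural equivalence of two nodes on a toy graph."""
    G = NX.Graph()
    G.add_edges_from([('a','c'), ('a','d'), ('b','c'), ('b','d'), ('b','e')])
    # 'a' and 'b' share the neighbours {c, d}, but 'b' also links 'e':
    # x_s = (3-2)/(3+2) = 0.2 (x_s = 0 would mean identical neighbourhoods)
    print struct_equivalence(G, 'a', 'b')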
def grow_subnetworks(G, T2):
    """Return the highest scoring (T2-test) subgraph of G.

    Uses simulated annealing to identify high-scoring subgraphs.

    ref: -- Ideker et.al (Bioinformatics 18, 2002)
         -- Patil and Nielsen (PNAS 2006)
    """
    N = 1000
    # random initial states: each node is either in (True) or out (False)
    # of the subnetwork
    states = dict([(node, rand()>.5) for node in G.nodes()])
    t2_last = 0.0
    nodes = G.nodes()
    for i in xrange(N):
        temp = 1. - float(i)/N # linear cooling schedule (assumed)
        # propose a new state by toggling one randomly chosen node
        node = nodes[int(rand()*len(nodes))]
        states[node] = not states[node]
        sub_nodes = [n for n in nodes if states[n]]
        Gsub = NX.subgraph(G, sub_nodes)
        components = NX.connected_component_subgraphs(Gsub)
        if not components:
            states[node] = not states[node] # revert empty proposal
            continue
        Gsub = components[0]
        t2 = sum([T2[n] for n in Gsub]) # subgraph score
        if t2 > t2_last:
            t2_last = t2 # accept improvements unconditionally
        else:
            # accept a worse state with Boltzmann probability
            p = numpy.exp((t2 - t2_last)/max(temp, eps))
            if rand() < p:
                t2_last = t2
            else:
                states[node] = not states[node] # revert
    return NX.subgraph(G, [n for n in nodes if states[n]])

"""Below are methods for calculating graph kernels.

Four main decompositions:
0.) adjacency diffusion kernel, expm(A)
1.) von Neumann kernels (diagonalisation of the adjacency matrix)
2.) Laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)

---- Kv von_neumann:           Kv = (I - alpha*A)^-1 (mod: A(I - alpha*A)^-1 ?), geom. series
---- Kl laplacian:             Kl = (I - alpha*L)^-1, geom. series
---- Kd laplacian_diffusion:   Kd = expm(-alpha*L), exp. series
---- Ke exponential diffusion: Ke = expm(A) .... expm(-A)?
"""

# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with
#    number of edges)
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are the same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s), -L = 1/degree ... etc
# Is it the negative of the exp. of the adj. matrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed, ok
# 31.01:      diff is ok, exp(0)=1, not zero!
# 07.03.2005: normalisation is ok -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors,
#             switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
# 27.04.2007: diffusion now uses pade approximations to the matrix exponential. Also the last

def K_expAdj(W, normalised=True, alpha=1.0):
    """Matrix exponential of adjacency matrix, mentioned in Kandola as
    a general diffusion kernel.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m,n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e,vr = eig(W)
    s = real(e)**2 # from eigenvalues to singular values
    vri = inv(vr)
    s = maximum.reduce(s) + s
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = abs(cond*maximum.reduce(s))
    psigma = eye(m)
    for i in range(len(s)):
        if abs(s[i]) > cutoff:
            psigma[i,i] = .5*alpha*exp(s[i])
    return dot(dot(vr, psigma), vri)

def K_vonNeumann(W, normalised=True, alpha=1.0):
    """The geometric series of path lengths.

    Returns the matrix square root of the pseudo-inverse of the
    adjacency matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m,n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./(sum(W,0))))
        W = dot(dot(T, W), T)
    e,vr = eig(W)
    vri = inv(vr)
    e = real(e) # we only work with real pos. eigvals
    e = maximum.reduce(e) + e
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,n),t)
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i,i] = 1.0/e[i] # these are eig.vals (=sqrt(sing.vals))
    return dot(dot(vr, psigma), vri).astype(t)
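# Usage sketch (assumption: the toy graph and the helper name are
# illustrative only). Both kernels take a dense adjacency matrix; entries
# K[i,j] reflect the weighted sum of walks connecting nodes i and j.
def _example_kernels():
    """Compute exponential and von Neumann kernels for a toy graph."""
    G = NX.Graph()
    G.add_edges_from([(0,1), (1,2), (2,3), (3,0), (1,3)])
    A = asarray(NX.adj_matrix(G)).astype('<f8')
    Ke = K_expAdj(A, normalised=True, alpha=1.0)
    Kv = K_vonNeumann(A, normalised=True)
    print Ke[0,1], Kv[0,1]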
def K_laplacian(W, normalised=True, alpha=1.0):
    """The matrix pseudo-inverse of the Laplacian L.

    Also known as the average commute time matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m,n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        L = dot(dot(T, L), T)
    e,vr = eig(L)
    e = real(e)
    vri = inv(vr)
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,),t) # if s is close to zero -> set 1/s = 0
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i] = 1.0/e[i]
    K = dot(dot(vr, diag(psigma)), vri).astype(t)
    K = real(K)
    I = eye(n)
    K = (1. - alpha)*I + alpha*K
    return K

def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
    """Returns diffusion kernel.

    input:
        -- W, adj. matrix
        -- normalised [True/False]
        -- alpha, [0,1] (degree of network influence)
        -- beta, [0->), (diffusion degree)
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(W.sum(0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./W.sum(0)))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    vri = inv(vr)
    cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = 1.*abs(cond*maximum.reduce(e))
    psigma = eye(m) # if eigvals are 0, exp(0)=1 (cutoff therefore unnecessary)
    #psigma = zeros((m,n), dtype='<f8')
    for i in range(len(e)):
        if abs(e[i]) > cutoff:
            psigma[i,i] = exp(-beta*e[i])
        #else:
        #    psigma[i,i] = 0.0
    K = real(dot(dot(vr, psigma), vri))
    I = eye(n, dtype='<f8')
    K = (1. - alpha)*I + alpha*K
    return K

def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5):
    """Returns diffusion kernel, using the Pade approximation to the
    matrix exponential (scipy.linalg.expm).

    input:
        -- W, adj. matrix
        -- normalised [True/False]
        -- beta, [0->), (diffusion degree)
    """
    D = diag(W.sum(0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./W.sum(0)))
        L = dot(dot(T, L), T)
    return expm(-beta*L)

def K_modularity(W, alpha=1.0):
    """Returns the matrix square root of Newman's modularity."""
    W = asarray(W)
    t = W.dtype.char
    m, n = W.shape
    d = sum(W, 0)
    m = 1.*sum(d)
    B = W - (outer(d, d)/m)
    s,v = sorted_eig(B, sort_by='lm')
    psigma = zeros((n, n), dtype='<f8')
    for i in range(len(s)):
        if s[i]>1e-7:
            psigma[i,i] = sqrt(s[i])
            #psigma[i,i] = s[i]
    K = dot(dot(v, psigma), v.T)
    I = eye(n)
    K = (1. - alpha)*I + alpha*K
    return K

def kernel_score(K, W):
    """Returns the modularity score.

    K -- (modularity) kernel
    W -- adjacency matrix (possibly weighted)
    """
    # normalise W (: W'W = I)
    m, n = shape(W)
    for i in range(n):
        W[:,i] = W[:,i]/norm(W[:,i])
    # score_i = w_i' K w_i for each normalised column w_i
    score = diag(dot(W.T, dot(K, W)))
    tot = sum(score)
    return score, tot

def modularity_matrix(G, nodelist=None):
    """Returns Newman's modularity matrix: B = A - d*d'/(2m)."""
    if not nodelist:
        nodelist = G.nodes()
    else:
        G = NX.subgraph(G, nodelist)
    A = NX.adj_matrix(G, nodelist=nodelist)
    d = atleast_2d(G.degree(nbunch=nodelist))
    m = 1.*G.number_of_edges()
    B = A - dot(d.T, d)/(2.*m)
    return B
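# Usage sketch (assumption: the toy graph, helper name and printed quantity
# are illustrative only; note that kernel_score normalises its W argument
# in place, hence the copy):
def _example_modularity_score():
    """Score a toy two-community graph with the modularity kernel."""
    G = NX.Graph()
    G.add_edges_from([(0,1), (0,2), (1,2), (2,3), (3,4), (3,5), (4,5)])
    A = asarray(NX.adj_matrix(G)).astype('<f8')
    K = K_modularity(A, alpha=1.0)
    score, tot = kernel_score(K, A.copy())
    print tot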