import os, sys
from itertools import izip

import networkx as NX
from scipy import shape, diag, dot, asarray, sqrt, real, zeros, eye, exp, \
     maximum, outer, sum, array, mean, ravel
from scipy.linalg import eig, svd, inv, expm, norm
from cx_utils import sorted_eig
import numpy

eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1, 'i': 1}


def xgraph_to_graph(G):
    """Convert an XGraph to an ordinary graph.

    Edge attributes, multiple edges and self-loops are lost in the process.
    """
    GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
    return GG

def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
    """Calculate a general affinity matrix based upon distances.

    Affinity = inverted distance (small distance -> high affinity).

    INPUT
    data: gene expression data, type dict, data[gene] = expression-vector
    G: the network (networkx.base.Graph object)
    mask: the array mask shows which data are missing.
          If mask[i][j]==0, then data[i][j] is missing.
    weight: the array weight contains the weights to be used when
            calculating distances.
    transpose: if transpose==0, then genes are clustered. If transpose==1,
               microarrays are clustered.
    dist: the character dist defines the distance function to be used:
          dist=='e': Euclidean distance
          dist=='b': City Block distance
          dist=='h': Harmonically summed Euclidean distance
          dist=='c': Pearson correlation
          dist=='a': absolute value of the correlation
          dist=='u': uncentered correlation
          dist=='x': absolute uncentered correlation
          dist=='s': Spearman's rank correlation
          dist=='k': Kendall's tau
          For other values of dist, the default (Euclidean distance) is used.

    OUTPUT
    D: affinity matrix (nGenes x nGenes), symmetric; distances are scaled
       to [0, MAX] and inverted, so the strongest affinity is MAX + 1.
    """
    try:
        from Bio import Cluster as CLS
    except ImportError:
        raise ValueError, "Biopython needs to be installed"
    nVar = len(data)
    nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i, gene in enumerate(ids):
        X[i,:] = data[gene]
    # Bio.Cluster returns the lower triangle of the distance matrix as a
    # list of rows of increasing length
    dists = CLS.distancematrix(X, dist=dist, mask=mask, weight=weight)
    D = zeros((nVar, nVar))
    for i, row in enumerate(dists):
        if i>0:
            D[i,:len(row)] = row
    D = D + D.T
    MAX = 30.0
    D_max = max(ravel(D))/MAX
    D_n = D/D_max                # normalised (max = MAX)
    D_n = (MAX + 1.) - D_n       # invert distances into affinities
    A = NX.adj_matrix(G, nodelist=ids)
    if out=='dist':
        return D_n*A
    elif out=='heat_kernel':
        t = 1.0
        K = exp(-t*D*A)
        return K
    elif out=='complete':
        return D_n
    else:
        return []

def remove_one_degree_nodes(G, iter=True):
    """Remove all nodes with only one neighbour.

    These nodes do not contribute to community structure.

    input:
       G -- graph
       iter -- True/False, iteratively remove?
    """
    G_copy = G.copy()
    if iter==True:
        while 1:
            bad_nodes = []
            for node in G_copy.nodes():
                if len(G_copy.neighbors(node))==1:
                    bad_nodes.append(node)
            if len(bad_nodes)>0:
                G_copy.delete_nodes_from(bad_nodes)
            else:
                break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
                bad_nodes.append(node)
        if len(bad_nodes)>0:
            G_copy.delete_nodes_from(bad_nodes)
    print "Deleted %s nodes from network" %(len(G)-len(G_copy))
    return G_copy
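# A minimal usage sketch for remove_one_degree_nodes (the function name
# _example_remove_one_degree_nodes is illustrative only; it assumes just
# the pre-1.0 NetworkX API used throughout this module). The pendant
# chain 2-3-4-5 is pruned away iteratively, leaving the triangle 0-1-2.
def _example_remove_one_degree_nodes():
    G = NX.Graph()
    G.add_edges_from([(0,1), (1,2), (2,0), (2,3), (3,4), (4,5)])
    H = remove_one_degree_nodes(G, iter=True)
    print H.nodes()   # expected: [0, 1, 2]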
def key_players(G, n=1, with_labels=False):
    """Resilience measure.

    Identification of key nodes by the fraction of nodes in the
    disconnected subgraph when the node is removed.

    output:
       fraction of nodes disconnected when node i is removed
    """
    i = 0
    frac = []
    labels = {}
    for node in G.nodes():
        i += 1
        print i
        T = G.copy()
        T.delete_node(node)
        n_nodes = T.number_of_nodes()
        sub_graphs = NX.connected_component_subgraphs(T)
        if len(sub_graphs)>1:
            strong_comp = sub_graphs[0]
            fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
            frac.append(fraction)
            labels[node] = fraction
        else:
            frac.append(0.0)
            labels[node] = 0.0
    out = 1.0 - array(frac)
    if with_labels==True:
        return out, labels
    else:
        return out

def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
    """Return a weighted adjacency matrix of graph; the weights are node weights.

    input:
       G -- graph
       weights -- dict, keys: nodes, values: weights
       with_labels -- True/False, return labels?
    output:
       A -- weighted adjacency matrix
       [index] -- node labels
    """
    n = G.order()
    # make a dictionary that maps vertex name to position
    index = {}
    count = 0
    for node in G.nodes():
        index[node] = count
        count = count + 1
    a = zeros((n,n))
    if isinstance(G, NX.XGraph):
        raise TypeError, "Expected an ordinary graph, not an XGraph"
    for head, tail in G.edges():
        if ave_type == 'geometric':
            a[index[head],index[tail]] = sqrt(weights[head]*weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # harmonic mean of the two node weights
            a[index[head],index[tail]] = 2.*weights[head]*weights[tail]/(weights[head] + weights[tail])
            a[index[tail],index[head]] = a[index[head],index[tail]]
    if with_labels:
        return a, index
    else:
        return a

def weighted_adj_matrix(G, with_labels=False):
    """Adjacency matrix of an XGraph whose weights are given in edges."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    W = A.astype('<f8')
    for head, tail, w in G.edges():
        W[labels[head], labels[tail]] = w
        W[labels[tail], labels[head]] = w
    if with_labels:
        return W, labels
    else:
        return W

def ave_neighbour_degree(G):
    """Return, for each node, the node degree and the mean degree of its
    neighbours (name and body reconstructed from a partial definition).
    """
    out = []
    d = G.degree(with_labels=True)
    for node in G.nodes():
        nn = G.neighbors(node)
        if len(nn)>0:
            nn_d = mean([float(d[i]) for i in nn])
            out.append((d[node], nn_d))
    return array(out).T

def struct_equivalence(G, n1, n2):
    """Return the structural equivalence of a node pair.

    Two nodes are structurally equal if they share the same neighbors.

    x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)] /
          [ne(n1) union ne(n2) + ne(n1) intersection ne(n2)]

    ref: Brun et.al 2003
    """
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if num_union + num_intersection == 0:
        xs = 0
    else:
        xs = 1.0*(num_union - num_intersection)/(num_union + num_intersection)
    return xs
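# A minimal sketch of struct_equivalence on a toy graph (assumes only the
# functions defined above; _example_struct_equivalence is illustrative).
# Neighbours of 1 are {0, 3}, neighbours of 2 are {0}, so the union has 2
# members and the intersection 1, giving x_s = (2-1)/(2+1) = 1/3.
def _example_struct_equivalence():
    G = NX.Graph()
    G.add_edges_from([(0,1), (0,2), (1,3)])
    print struct_equivalence(G, 1, 2)   # expected: 0.333...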
def struct_equivalence_all(G):
    """Not finished."""
    A, labels = NX.adj_matrix(G, with_labels=True)
    pass

def hamming_distance(n1, n2):
    """Not finished."""
    pass

def graph_corrcoeff(G):
    """Not finished."""
    A, index = NX.adj_matrix(G, with_labels=True)
    #C = zeros(*A.shape(), 'd')
    n = 1.*G.number_of_nodes()
    for node in G.nodes():
        a_j = A[index[node],:]   # neighbors
        mean_a = sum(a_j)/n      # degree(G)/number_of_nodes()
        var_a = sqrt(sum((a_j - mean_a)**2)/n)
    pass

def graph_and_data_intersection(data, graph, pathways=None, keep_connected=True):
    """Return the intersection of keys in two dictionaries.

    NB: keep track of identifier sorting after these dict transforms.

    input:
       data -- dict, keys: gene id, value: measurement profile
       graph -- networkx.base.Graph, full graph
       pathways -- dict, keys: pathway name, values: nodes in pathway

    call:
       new_data, new_graph, new_pathways = graph_and_data_intersection(
                      data, graph, pathways, keep_connected=True)
    """
    new_graph = graph.copy()
    new_data = {}
    new_pathways = {}
    graph_set = set(graph.nodes())
    data_set = set(data.keys())
    intersection = data_set & graph_set
    new_graph.delete_nodes_from(graph_set - data_set)   # remove difference
    for k in intersection:
        new_data[k] = data[k]
    if keep_connected:
        max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0]   # largest connected component
        old_data = new_data
        while new_graph.number_of_nodes() != len(new_data) and max_iter<100:
            max_iter += 1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
            intersection = data_set & graph_set
            new_graph.delete_nodes_from(graph_set - data_set)
            new_data = {}
            for k in intersection:
                new_data[k] = old_data[k]
            old_data = new_data.copy()
            new_graph = NX.connected_component_subgraphs(new_graph)[0]
    if pathways!=None:
        for pth, nodes in pathways.items():
            new_pathways[pth] = [node for node in nodes if node in new_graph]
    print "\nSUMMARY (graph_and_data_intersection): "
    print "Number of input variables: %s" %len(data)
    print "Number of nodes in input graph: %s" %len(graph)
    print "\nUsing intersection of connected graph and nodes with data values"
    print "Number of variables is now: %s" %len(new_data)
    print "Number of nodes in graph: %s" %new_graph.number_of_nodes()
    if pathways!=None:
        return new_data, new_graph, new_pathways
    else:
        return new_data, new_graph
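# A minimal sketch for graph_and_data_intersection (toy identifiers, not
# real gene ids; _example_graph_and_data_intersection is illustrative).
# Node 'd' has no expression profile, so it is dropped and the largest
# connected component of the remainder is kept.
def _example_graph_and_data_intersection():
    G = NX.Graph()
    G.add_edges_from([('a','b'), ('b','c'), ('c','d')])
    data = {'a': [1., 2.], 'b': [0., 1.], 'c': [2., 2.]}
    new_data, new_graph = graph_and_data_intersection(data, G,
                                                      keep_connected=True)
    print new_graph.nodes()   # expected: ['a', 'b', 'c']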
def rx_graph_and_data_intersection(graph, node_data, pathways, data, keep_connected=False):
    """Return a (connected) reaction graph with present gene expression data.

    keep_connected==True:
       When a node (gene) is not present in our expression data, the node
       is deleted and all its neighbours are connected with edge weight=0.5
       if they are not already neighbours.

    input:
       data -- dict, keys: gene id, value: measurement profile
       graph -- networkx.xbase.XGraph, full weighted graph
       node_data -- dict, keys: rx id, value: set of gene ids
       pathways -- dict, keys: pathway name, values: list of nodes in pathway
    """
    # We do not connect the full graph ... may be performed by using the
    # reference graph?
    graph = NX.connected_component_subgraphs(graph)[0]   # largest connected component
    new_graph = graph.copy()
    new_data = {}
    new_node_data = node_data.copy()
    new_pathways = {}
    genes_in_graph = set()
    genes_in_data = set(data.keys())
    rx_in_graph = set(new_graph.nodes())
    # genes in graph nodes (rx nodes)
    for rx in rx_in_graph:
        genes_in_graph.update(set(new_node_data.get(rx)))
    keep_genes = genes_in_data.intersection(genes_in_graph)   # both in graph and data
    # update node data
    for rx, genes in node_data.items():
        # delete node data of nodes not present in graph
        genes = set(genes)
        genes.intersection_update(keep_genes)   # remove genes not in the intersection
        if len(genes)==0 or rx not in rx_in_graph:   # no gene data or not in graph
            print "removing: " + str(rx)
            del new_node_data[rx]
        else:
            new_node_data[rx] = genes
    rx_in_data = set(new_node_data.keys())
    rx_intersection = rx_in_data.intersection(rx_in_graph)
    for gene in keep_genes:
        new_data[gene] = data.get(gene)
    # update pathway nodes
    for pth, genes in pathways.items():
        if genes:
            genes = set(genes)
            genes.intersection_update(keep_genes)   # gene needs to have data
        new_pathways[pth] = genes
    bad_nodes = rx_in_graph.difference(rx_in_data)   # in graph but no data
    if keep_connected==True:
        dummy = new_graph.copy()
        for rx in bad_nodes:
            dummy.delete_node(rx)
            if len(NX.connected_component_subgraphs(dummy))>1:
                nghbrs = new_graph.neighbors(rx)
                for i in nghbrs:
                    for j in nghbrs:
                        if i!=j and not new_graph.has_edge(i,j):
                            new_graph.add_edge(i, j, 0.5)
    # update graph
    new_graph.delete_nodes_from(list(bad_nodes))
    return new_graph, new_node_data, new_pathways, new_data

def weighted_laplacian(G, with_labels=False):
    """Return the standard Laplacian of a graph from a weighted adjacency matrix."""
    n = G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
    L = D - A
    if with_labels:
        A, index = weighted_adj_matrix(G, with_labels=True)
        return L, index
    else:
        return L

"""Below are methods for calculating graph metrics.

Four main decompositions:
 0.) adjacency diffusion kernel expm(A),
 1.) von Neumann kernels (diagonalisation of adjacency matrix)
 2.) Laplacian kernels (geometric series of adj.)
 3.) diffusion kernels (exponential series of adj.)

---- Kv von_neumann:           Kv = (I - alpha*A)^-1 (mod: A(I - alpha*A)^-1 ?), geom. series
---- Kl laplacian:             Kl = (I - alpha*L)^-1, geom. series
---- Kd laplacian_diffusion:   Kd = expm(-alpha*L), exp. series
---- Ke exponential diffusion: Ke = expm(A) .... expm(-A)?
"""

# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with
#    number of edges)
#
# the Neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are the same as the highest of K = A*A ?
# this needs clarification
#
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s),
# -L = 1/degree ... etc.
# Is it the negative of the exp. of the adj. matrix in Kandola?
#
# normalised=False returns only nans (no idea why!!) ... fixed, ok
# 31.1: diff is ok, exp(0)=1, not zero!
# 07.03.2005: normalisation is ok -> normalisation will emphasise high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors?
#             switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
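# A minimal cross-check sketch for the kernel formulas listed above,
# computed directly with dense linear algebra instead of the spectral
# implementations below (_example_kernel_formulas is illustrative only;
# alpha is assumed small enough that the geometric series converge, i.e.
# I - alpha*A and I - alpha*L stay nonsingular).
def _example_kernel_formulas(A, alpha=0.1):
    n = A.shape[0]
    I = eye(n)
    L = diag(sum(A, 0)) - A     # combinatorial Laplacian
    Kv = inv(I - alpha*A)       # von Neumann kernel, geometric series
    Kl = inv(I - alpha*L)       # Laplacian kernel, geometric series
    Kd = expm(-alpha*L)         # Laplacian diffusion kernel, exp. series
    Ke = expm(A)                # exponential diffusion kernel
    return Kv, Kl, Kd, Ke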
def K_expAdj(W, normalised=False, alpha=1.0):
    """Matrix exponential of the adjacency matrix, mentioned in Kandola as
    a general diffusion kernel.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    s = real(e)**2   # from eigenvalues to singular values
    vri = inv(vr)
    s = maximum.reduce(s) + s
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = abs(cond*maximum.reduce(s))
    psigma = eye(m)
    for i in range(len(s)):
        if abs(s[i]) > cutoff:
            psigma[i,i] = .5*alpha*exp(s[i])
    return dot(dot(vr, psigma), vri)

def K_vonNeumann(W, normalised=False, alpha=1.0):
    """The geometric series of path lengths.

    Returns the matrix square root of the pseudo-inverse of the adjacency
    matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        W = dot(dot(T, W), T)
    e, vr = eig(W)
    vri = inv(vr)
    e = real(e)   # we only work with real pos. eigvals
    e = maximum.reduce(e) + e
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,n), t)
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i,i] = 1.0/e[i]   # these are eig.vals (=sqrt(sing.vals))
    return dot(dot(vr, psigma), vri).astype(t)

def K_laplacian(W, normalised=True, alpha=1.0):
    """The matrix square root of the pseudo-inverse of L.

    Also known as the average commute time matrix.
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    e = real(e)
    vri = inv(vr)
    cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = cond*maximum.reduce(e)
    psigma = zeros((m,), t)   # if s close to zero -> set 1/s = 0
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i] = 1.0/e[i]
    K = dot(dot(vr, diag(psigma)), vri).astype(t)
    K = real(K)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
    """Returns the diffusion kernel.

    input:
       W -- adjacency matrix
       normalised -- True/False
       alpha -- [0,1], degree of network influence
       beta -- [0,->), diffusion degree
    """
    W = asarray(W)
    t = W.dtype.char
    if len(W.shape)!=2:
        raise ValueError, "Non-matrix input to matrix function."
    m, n = W.shape
    if t in ['F','D']:
        raise TypeError, "Complex input!"
    D = diag(sum(W,0))
    L = D - W
    if normalised==True:
        T = diag(sqrt(1./sum(W,0)))
        L = dot(dot(T, L), T)
    e, vr = eig(L)
    vri = inv(vr)
    cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
    cutoff = 1.*abs(cond*maximum.reduce(e))
    psigma = eye(m)   # if eigvals are 0, exp(0)=1 (unnecessary)
    #psigma = zeros((m,n), dtype='<f8')
    for i in range(len(e)):
        if e[i] > cutoff:
            psigma[i,i] = exp(-beta*e[i])
    K = real(dot(dot(vr, psigma), vri))
    I = eye(n, dtype='<f8')
    K = (1 - alpha)*I + alpha*K
    return K

def K_modularity(W, alpha=1.0):
    """Returns the matrix square root of Newman's modularity matrix
    (reconstructed from a partial definition).
    """
    W = asarray(W)
    m, n = W.shape
    d = sum(W, 0)                  # node degrees
    B = W - outer(d, d)/sum(d)     # modularity matrix
    e, v = eig(B)
    s = real(e)
    v = real(v)
    psigma = zeros((n,n), dtype='<f8')
    for i in range(len(s)):
        if s[i] > 1e-7:
            psigma[i,i] = sqrt(s[i])
            #psigma[i,i] = s[i]
    K = dot(dot(v, psigma), v.T)
    I = eye(n)
    K = (1 - alpha)*I + alpha*K
    return K

def kernel_score(K, W):
    """Returns the modularity score.

    K -- (modularity) kernel
    W -- adjacency matrix (possibly weighted)
    """
    # normalise W (: W'W = I)
    m, n = shape(W)
    for i in range(n):
        W[:,i] = W[:,i]/norm(W[:,i])
    score = diag(dot(W.T, dot(K, W)))
    tot = sum(score)
    return score, tot
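# A minimal end-to-end sketch (illustration only; _example_kernel_score
# is a hypothetical driver): build a small adjacency matrix, compute the
# Laplacian kernel and score it against the adjacency structure with
# kernel_score. A copy is passed because kernel_score normalises W in place.
def _example_kernel_score():
    A = array([[0., 1., 1., 0.],
               [1., 0., 1., 0.],
               [1., 1., 0., 1.],
               [0., 0., 1., 0.]])
    K = K_laplacian(A, normalised=True, alpha=1.0)
    score, tot = kernel_score(K, A.copy())
    print score, tot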