568 lines
16 KiB
Python
568 lines
16 KiB
Python
import os,sys
|
|
from itertools import izip
|
|
import networkx as NX
|
|
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
|
|
outer,maximum,sum,diag,real,atleast_2d
|
|
from scipy.linalg import eig,svd,inv,expm,norm
|
|
from cx_utils import sorted_eig
|
|
|
|
import numpy
|
|
|
|
|
|
|
|
eps = numpy.finfo(float).eps.item()
|
|
feps = numpy.finfo(numpy.single).eps.item()
|
|
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
|
|
|
|
class NXUTILSException(Exception): pass
|
|
|
|
def xgraph_to_graph(G):
|
|
"""Convert an Xgraph to an ordinary graph.
|
|
Edge attributes, mult.edges and self-loops are lost in the process.
|
|
"""
|
|
|
|
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
|
|
return GG
|
|
|
|
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
|
|
"""
|
|
Function for calculating a general affinity matrix, based upon distances.
|
|
Affiniy = 1 - distance ((10-1) 1 is far apart)
|
|
INPUT
|
|
|
|
data:
|
|
gene expression data, type dict data[gene] = expression-vector
|
|
|
|
G:
|
|
The network (networkx.base.Graph object)
|
|
|
|
mask:
|
|
The array mask shows which data are missing. If mask[i][j]==0, then
|
|
data[i][j] is missing.
|
|
|
|
weights:
|
|
The array weight contains the weights to be used when calculating distances.
|
|
|
|
transpose:
|
|
If transpose==0, then genes are clustered. If transpose==1, microarrays are
|
|
clustered.
|
|
|
|
dist:
|
|
The character dist defines the distance function to be used:
|
|
dist=='e': Euclidean distance
|
|
dist=='b': City Block distance
|
|
dist=='h': Harmonically summed Euclidean distance
|
|
dist=='c': Pearson correlation
|
|
dist=='a': absolute value of the correlation
|
|
dist=='u': uncentered correlation
|
|
dist=='x': absolute uncentered correlation
|
|
dist=='s': Spearman's rank correlation
|
|
dist=='k': Kendall's tau
|
|
For other values of dist, the default (Euclidean distance) is used.
|
|
|
|
OUTPUT
|
|
D :
|
|
Similariy matrix (nGenes x nGenes), symetric, d_ij e in [0,1]
|
|
Normalized so max weight = 1.0
|
|
"""
|
|
try:
|
|
from Bio import Cluster as CLS
|
|
except:
|
|
raise NXUTILSError("Import of Biopython failed")
|
|
n_var = len(data)
|
|
n_samp = len(data[data.keys()[0]])
|
|
X = zeros((nVar, nSamp),dtpye='<f8')
|
|
|
|
for i, gene in enumerate(ids): #this shuld be right!!
|
|
X[i,:] = data[gene]
|
|
|
|
|
|
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
|
|
|
|
D_list = CLS.distancematrix(X, dist=dist)
|
|
D = zeros((nVar,nVar),dtype='<f8')
|
|
for i,row in enumerate(D_list):
|
|
if i>0:
|
|
D[i,:len(row)]=row
|
|
|
|
D = D + D.T
|
|
MAX = 30.0
|
|
D_max = max(ravel(D))/MAX
|
|
D_n = D/D_max #normalised (max = 10.0)
|
|
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
|
|
|
|
A = NX.adj_matrix(G, nodelist=ids)
|
|
if out=='dist':
|
|
return D_n*A
|
|
elif out=='heat_kernel':
|
|
t=1.0
|
|
K = exp(-t*D*A)
|
|
return K
|
|
elif out=='complete':
|
|
return D_n
|
|
else:
|
|
return []
|
|
|
|
def remove_one_degree_nodes(G, iter=True):
|
|
"""Removes all nodes with only one neighbour. These nodes does
|
|
not contribute to community structure.
|
|
input:
|
|
G -- graph
|
|
iter -- True/False iteratively remove?
|
|
"""
|
|
G_copy = G.copy()
|
|
if iter==True:
|
|
while 1:
|
|
bad_nodes=[]
|
|
for node in G_copy.nodes():
|
|
if len(G_copy.neighbors(node))==1:
|
|
bad_nodes.append(node)
|
|
if len(bad_nodes)>0:
|
|
G_copy.delete_nodes_from(bad_nodes)
|
|
else:
|
|
break
|
|
else:
|
|
bad_nodes=[]
|
|
for ngb in G_copy.neighbors_iter():
|
|
if len(G_copy.neighbors(node))==1:
|
|
bad_nodes.append(node)
|
|
if len(bad_nodes)>0:
|
|
G_copy.delete_nodes_from(bad_nodes)
|
|
|
|
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
|
|
return G_copy
|
|
|
|
def key_players(G, n=1, with_labels=False):
|
|
"""
|
|
Resilince measure
|
|
Identification of key nodes by fraction of nodes in
|
|
disconnected subgraph when the node is removed.
|
|
|
|
output:
|
|
fraction of nodes disconnected when node i is removed
|
|
"""
|
|
i=0
|
|
frac=[]
|
|
labels = {}
|
|
for node in G.nodes():
|
|
i+=1
|
|
print i
|
|
T = G.copy()
|
|
T.delete_node(node)
|
|
n_nodes = T.number_of_nodes()
|
|
sub_graphs = NX.connected_component_subgraphs(T)
|
|
n = len(sub_graphs)
|
|
if n>1:
|
|
strong_comp = sub_graphs[0]
|
|
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
|
|
frac.append(fraction)
|
|
labels[node]=fraction
|
|
|
|
else:
|
|
frac.append(0.0)
|
|
labels[node]=0.0
|
|
|
|
out = 1.0 - array(frac)
|
|
if with_labels==True:
|
|
return out,labels
|
|
else:
|
|
return out
|
|
|
|
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
|
|
"""Return a weighted adjacency matrix of graph. The weights are
|
|
node weights.
|
|
input: G -- graph
|
|
weights -- dict, keys: nodes, values: weights
|
|
with_labels -- True/False, return labels?
|
|
|
|
output: A -- weighted eadjacency matrix
|
|
[index] -- node labels
|
|
|
|
"""
|
|
n=G.order()
|
|
# make an dictionary that maps vertex name to position
|
|
index={}
|
|
count=0
|
|
for node in G.nodes():
|
|
index[node]=count
|
|
count = count+1
|
|
|
|
a = zeros((n,n))
|
|
if type(G)=='networkx.xbase.XGraph':
|
|
raise
|
|
for head,tail in G.edges():
|
|
if ave_type == 'geometric':
|
|
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
|
|
a[index[tail],index[head]]= a[index[head],index[tail]]
|
|
elif ave_type == 'harmonic':
|
|
a[index[head],index[tail]] = mean(weights[head],weights[tail])
|
|
a[index[tail],index[head]]= mean(weights[head],weights[tail])
|
|
if with_labels:
|
|
return a,index
|
|
else:
|
|
return a
|
|
|
|
def weighted_adj_matrix(G, with_labels=False):
|
|
"""Adjacency matrix of an XGraph whos weights are given in edges.
|
|
"""
|
|
A, labels = NX.adj_matrix(G, with_labels=True)
|
|
W = A.astype('<f8')
|
|
for orf, i in labels.items():
|
|
for orf2, j in labels.items():
|
|
if G.has_edge(orf, orf2):
|
|
edge_weight = G.get_edge(orf, orf2)
|
|
W[i,j] = edge_weight
|
|
W[j,i] = edge_weight
|
|
if with_labels==True:
|
|
return W, labels
|
|
else:
|
|
return W
|
|
|
|
def assortative_index(G):
|
|
"""Ouputs two vectors: the degree and the neighbor average degree.
|
|
Used to measure the assortative mixing. If the average degree is
|
|
pos. correlated with the degree we know that hubs tend to connect
|
|
to other hubs.
|
|
|
|
input: G, graph connected!!
|
|
ouput: d,mn_d: degree, and average degree of neighb.
|
|
(degree sorting from degree(with_labels=True))
|
|
"""
|
|
d = G.degree(with_labels=True)
|
|
out=[]
|
|
for node in G.nodes():
|
|
nn = G.neighbors(node)
|
|
if len(nn)>0:
|
|
nn_d = mean([float(d[i]) for i in nn])
|
|
out.append((d[node], nn_d))
|
|
return array(out).T
|
|
|
|
|
|
def struct_equivalence(G,n1,n2):
|
|
"""Returns the structural equivalence of a node pair. Two nodes
|
|
are structural equal if they share the same neighbors.
|
|
|
|
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
|
|
union ne(n2) + ne(n1) intersection ne(n2)]
|
|
ref: Brun et.al 2003
|
|
"""
|
|
|
|
#[ne(n1) union ne(n2) - ne(n1) intersection ne(n2
|
|
s1 = set(G.neighbors(n1))
|
|
s2 = set(G.neighbors(n2))
|
|
num_union = len(s1.union(s2))
|
|
num_intersection = len(s1.intersection(s2))
|
|
if num_union & num_intersection:
|
|
xs=0
|
|
else:
|
|
xs = (num_union - num_intersection)/(num_union + num_intersection)
|
|
return xs
|
|
|
|
def struct_equivalence_all(G):
|
|
"""Not finnished.
|
|
"""
|
|
A,labels = NX.adj_matrix(G,with_labels=True)
|
|
pass
|
|
|
|
def hamming_distance(n1,n2):
|
|
"""Not finnsihed.
|
|
"""
|
|
pass
|
|
|
|
def graph_corrcoeff(G, vec=None, nodelist=None, sim='corr'):
|
|
"""Returns the correlation coefficient for each node. The
|
|
correlation coefficient is between the node and its neighbours.
|
|
|
|
"""
|
|
if nodelist==None:
|
|
nodelist=G.nodes()
|
|
if vec == None:
|
|
vec = G.degree(nodelist)
|
|
if len(vec)!=len(nodelist):
|
|
raise NXUTILSError("The node value vector is not of same length (%s) as the nodelist(%s)") %(len(vec), len(nodelist))
|
|
|
|
A = NX.ad_matrix(G, nodelist=nodelist)
|
|
for i, node in enumerate(nodelist):
|
|
nei_i = A[i,:]==1
|
|
vec_i = vec[nei_i]
|
|
|
|
def weighted_laplacian(G,with_labels=False):
|
|
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
|
|
n= G.order()
|
|
I = scipy.eye(n)
|
|
A = weighted_adj_matrix(G)
|
|
D = I*scipy.sum(A, 0)
|
|
L = D-A
|
|
if with_labels:
|
|
A,index = weighted_adj_matrix(G, with_labels=True)
|
|
return L, index
|
|
else:
|
|
return L
|
|
|
|
def grow_subnetworks(G, T2):
|
|
"""Return the highest scoring (T2-test) subgraph og G.
|
|
|
|
Use simulated annealing to identify highly grow subgraphs.
|
|
|
|
ref: -- Ideker et.al (Bioinformatics 18, 2002)
|
|
-- Patil and Nielsen (PNAS 2006)
|
|
|
|
"""
|
|
N = 1000
|
|
states = [(node, False) for node in G.nodes()]
|
|
t2_last = 0.0
|
|
for i in xrange(N):
|
|
if i==0: #assign random states
|
|
states = [(state[0], True) for state in states if rand(1)>.5]
|
|
sub_nodes = [state[0] for state in states if state[1]]
|
|
Gsub = NX.subgraph(G, sub_nodes)
|
|
Gsub = NX.connected_components_subgraphs(Gsub)[0]
|
|
t2 = [T2[node] for node in Gsub]
|
|
if t2>t2_last:
|
|
pass
|
|
else:
|
|
p = numpy.exp()
|
|
|
|
|
|
|
|
"""Below are methods for calculating graph metrics
|
|
|
|
Four main decompositions :
|
|
0.) Adjacency diffusion kernel expm(A),
|
|
1.) von neumann kernels (diagonalisation of adjacency matrix)
|
|
|
|
2.) laplacian kernels (geometric series of adj.)
|
|
|
|
3.) diffusion kernels (exponential series of adj.)
|
|
|
|
---- Kv
|
|
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
|
|
geom. series
|
|
|
|
---- Kl
|
|
laplacian: Kl = (I-alpha*L)^-1 , geom. series
|
|
|
|
---- Kd
|
|
laplacian_diffusion: Kd = expm(-alpha*L)
|
|
exp. series
|
|
|
|
---- Ke
|
|
Exponential diffusion.
|
|
Ke = expm(A) .... expm(-A)?
|
|
|
|
"""
|
|
|
|
# TODO:
|
|
# check for numerical unstable eigenvalues and set to zero
|
|
# othervise some inverses wil explode ->ok ..using pinv for inverses
|
|
#
|
|
# This gives results that look numerical unstable
|
|
#
|
|
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
|
|
#
|
|
#
|
|
#
|
|
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
|
|
# lowest eigenvectors are same as the highest of K = A*A ?
|
|
# this needs clarification
|
|
|
|
# diffusion is still wrong! ... ok
|
|
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
|
|
# Is it the negative of exp. of adj. metrix in Kandola?
|
|
#
|
|
# Normalised=False returns only nans (no idea why!!) ... fixed ok
|
|
|
|
# 31.1: diff is ok exp(0)=1 not zero!
|
|
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
|
|
# 10.03.2005: symeig is unstable an returns nans of some eigenvectors? switching back to eig
|
|
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
|
|
# 13.09.2206: update for use in numpy
|
|
|
|
# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last
|
|
|
|
def K_expAdj(W, normalised=True, alpha=1.0):
|
|
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
|
|
"""
|
|
W = asarray(W)
|
|
t = W.dtype.char
|
|
if len(W.shape)!=2:
|
|
raise ValueError, "Non-matrix input to matrix function."
|
|
m,n = W.shape
|
|
if t in ['F','D']:
|
|
raise TypeError, "Complex input!"
|
|
if normalised==True:
|
|
T = diag( sqrt( 1./(sum(W,0))) )
|
|
W = dot(dot(T, W), T)
|
|
e,vr = eig(W)
|
|
s = real(e)**2 # from eigenvalues to singularvalues
|
|
vri = inv(vr)
|
|
s = maximum.reduce(s) + s
|
|
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
|
|
cutoff = abs(cond*maximum.reduce(s))
|
|
psigma = eye(m)
|
|
for i in range(len(s)):
|
|
if abs(s[i]) > cutoff:
|
|
psigma[i,i] = .5*alpha*exp(s[i])
|
|
|
|
return dot(dot(vr,psigma),vri)
|
|
|
|
def K_vonNeumann(W, normalised=True, alpha=1.0):
|
|
""" The geometric series of path lengths.
|
|
Returns matrix square root of pseudo inverse of the adjacency matrix.
|
|
"""
|
|
W = asarray(W)
|
|
t = W.dtype.char
|
|
if len(W.shape)!=2:
|
|
raise ValueError, "Non-matrix input to matrix function."
|
|
m,n = W.shape
|
|
if t in ['F','D']:
|
|
raise TypeError, "Complex input!"
|
|
|
|
if normalised==True:
|
|
T = diag(sqrt(1./(sum(W,0))))
|
|
W = dot(dot(T,W),T)
|
|
e,vr = eig(W)
|
|
vri = inv(vr)
|
|
e = real(e) # we only work with real pos. eigvals
|
|
e = maximum.reduce(e) + e
|
|
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
|
|
cutoff = cond*maximum.reduce(e)
|
|
psigma = zeros((m,n),t)
|
|
for i in range(len(e)):
|
|
if e[i] > cutoff:
|
|
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
|
|
return dot(dot(vr,psigma),vri).astype(t)
|
|
|
|
def K_laplacian(W, normalised=True, alpha=1.0):
|
|
""" This is the matrix pseudo inverse of L.
|
|
Also known as the average commute time matrix.
|
|
"""
|
|
W = asarray(W)
|
|
t = W.dtype.char
|
|
if len(W.shape)!=2:
|
|
raise ValueError, "Non-matrix input to matrix function."
|
|
m,n = W.shape
|
|
if t in ['F','D']:
|
|
raise TypeError, "Complex input!"
|
|
D = diag(sum(W,0))
|
|
L = D - W
|
|
if normalised==True:
|
|
T = diag(sqrt(1./sum(W, 0)))
|
|
L = dot(dot(T, L), T)
|
|
e,vr = eig(L)
|
|
e = real(e)
|
|
vri = inv(vr)
|
|
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
|
|
cutoff = cond*maximum.reduce(e)
|
|
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
|
|
for i in range(len(e)):
|
|
if e[i] > cutoff:
|
|
psigma[i] = 1.0/e[i]
|
|
K = dot(dot(vr, diag(psigma)), vri).astype(t)
|
|
K = real(K)
|
|
I = eye(n)
|
|
K = (1-alpha)*I + alpha*K
|
|
return K
|
|
|
|
|
|
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
|
|
"""Returns diffusion kernel.
|
|
input:
|
|
-- W, adj. matrix
|
|
-- normalised [True/False]
|
|
-- alpha, [0,1] (degree of network influence)
|
|
-- beta, [0->), (diffusion degree)
|
|
"""
|
|
W = asarray(W)
|
|
t = W.dtype.char
|
|
if len(W.shape)!=2:
|
|
raise ValueError, "Non-matrix input to matrix function."
|
|
m, n = W.shape
|
|
if t in ['F','D']:
|
|
raise TypeError, "Complex input!"
|
|
D = diag(W.sum(0))
|
|
L = D - W
|
|
if normalised==True:
|
|
T = diag(sqrt(1./W.sum(0)))
|
|
L = dot(dot(T, L), T)
|
|
e, vr = eig(L)
|
|
vri = inv(vr) #inv
|
|
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
|
|
cutoff = 1.*abs(cond*maximum.reduce(e))
|
|
psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
|
|
#psigma = zeros((m,n), dtype='<f8')
|
|
for i in range(len(e)):
|
|
if abs(e[i]) > cutoff:
|
|
psigma[i,i] = exp(-beta*e[i])
|
|
#else:
|
|
# psigma[i,i] = 0.0
|
|
K = real(dot(dot(vr, psigma), vri))
|
|
I = eye(n, dtype='<f8')
|
|
K = (1. - alpha)*I + alpha*K
|
|
return K
|
|
|
|
def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
|
|
"""Returns diffusion kernel, using fast pade approximation.
|
|
input:
|
|
-- W, adj. matrix
|
|
-- normalised [True/False]
|
|
-- beta, [0->), (diffusion degree)
|
|
"""
|
|
|
|
D = diag(W.sum(0))
|
|
L = D - W
|
|
if normalised==True:
|
|
T = diag(sqrt(1./W.sum(0)))
|
|
L = dot(dot(T, L), T)
|
|
return expm(-beta*L)
|
|
|
|
|
|
def K_modularity(W,alpha=1.0):
|
|
""" Returns the matrix square root of Newmans modularity."""
|
|
W = asarray(W)
|
|
t = W.dtype.char
|
|
m, n = W.shape
|
|
d = sum(W, 0)
|
|
m = 1.*sum(d)
|
|
B = W - (outer(d, d)/m)
|
|
s,v = sorted_eig(B, sort_by='lm')
|
|
psigma = zeros( (n, n), dtype='<f8' )
|
|
for i in range(len(s)):
|
|
if s[i]>1e-7:
|
|
psigma[i,i] = sqrt(s[i])
|
|
#psigma[i,i] = s[i]
|
|
K = dot(dot(v, psigma), v.T)
|
|
I = eye(n)
|
|
K = (1 - alpha)*I + alpha*K
|
|
return K
|
|
|
|
def kernel_score(K, W):
|
|
"""Returns the modularity score.
|
|
K -- (modularity) kernel
|
|
W -- adjacency matrix (possibly weighted)
|
|
"""
|
|
# normalize W (: W'W=I)
|
|
m, n = shape(W)
|
|
for i in range(n):
|
|
W[:,i] = W[:,i]/norm(W[:,i])
|
|
score = diag(dot(W, dot(K, W)) )
|
|
tot = sum(score)
|
|
return score, tot
|
|
|
|
|
|
def modularity_matrix(G, nodelist=None):
|
|
if not nodelist:
|
|
nodelist = G.nodes()
|
|
else:
|
|
G = NX.subgraph(G, nodelist)
|
|
|
|
A = NX.adj_matrix(G, nodelist=nodelist)
|
|
d = atleast_2d(G.degree(nbunch=nodelist))
|
|
m = 1.*G.number_of_edges()
|
|
B = A - A/m
|
|
return B
|
|
|
|
|
|
|
|
|