This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
laydi/fluents/lib/nx_utils.py

568 lines
16 KiB
Python
Raw Normal View History

2006-12-18 12:59:12 +01:00
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
2007-07-23 19:33:21 +02:00
outer,maximum,sum,diag,real,atleast_2d
2006-12-18 12:59:12 +01:00
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
2007-03-14 17:32:49 +01:00
2006-12-18 12:59:12 +01:00
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
2007-03-14 17:32:49 +01:00
class NXUTILSException(Exception): pass
2006-12-18 12:59:12 +01:00
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
Affiniy = 1 - distance ((10-1) 1 is far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
weights:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
Similariy matrix (nGenes x nGenes), symetric, d_ij e in [0,1]
Normalized so max weight = 1.0
"""
try:
from Bio import Cluster as CLS
except:
2007-03-14 17:32:49 +01:00
raise NXUTILSError("Import of Biopython failed")
n_var = len(data)
n_samp = len(data[data.keys()[0]])
2006-12-18 12:59:12 +01:00
X = zeros((nVar, nSamp),dtpye='<f8')
2007-03-14 17:32:49 +01:00
for i, gene in enumerate(ids): #this shuld be right!!
2006-12-18 12:59:12 +01:00
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((nVar,nVar),dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = max(ravel(D))/MAX
D_n = D/D_max #normalised (max = 10.0)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
else:
bad_nodes=[]
for ngb in G_copy.neighbors_iter():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
Resilince measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
n = len(sub_graphs)
if n>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
output: A -- weighted eadjacency matrix
[index] -- node labels
"""
n=G.order()
# make an dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
if type(G)=='networkx.xbase.XGraph':
raise
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
elif ave_type == 'harmonic':
a[index[head],index[tail]] = mean(weights[head],weights[tail])
a[index[tail],index[head]]= mean(weights[head],weights[tail])
if with_labels:
return a,index
else:
return a
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
2007-01-25 12:58:10 +01:00
A, labels = NX.adj_matrix(G, with_labels=True)
2006-12-18 12:59:12 +01:00
W = A.astype('<f8')
2007-01-25 12:58:10 +01:00
for orf, i in labels.items():
for orf2, j in labels.items():
if G.has_edge(orf, orf2):
edge_weight = G.get_edge(orf, orf2)
W[i,j] = edge_weight
W[j,i] = edge_weight
2006-12-18 12:59:12 +01:00
if with_labels==True:
2007-01-25 12:58:10 +01:00
return W, labels
2006-12-18 12:59:12 +01:00
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
ouput: d,mn_d: degree, and average degree of neighb.
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
2007-03-14 17:32:49 +01:00
2006-12-18 12:59:12 +01:00
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
#[ne(n1) union ne(n2) - ne(n1) intersection ne(n2
s1 = set(G.neighbors(n1))
s2 = set(G.neighbors(n2))
num_union = len(s1.union(s2))
num_intersection = len(s1.intersection(s2))
if num_union & num_intersection:
xs=0
else:
xs = (num_union - num_intersection)/(num_union + num_intersection)
return xs
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
2007-03-14 17:32:49 +01:00
def graph_corrcoeff(G, vec=None, nodelist=None, sim='corr'):
"""Returns the correlation coefficient for each node. The
correlation coefficient is between the node and its neighbours.
2006-12-18 12:59:12 +01:00
"""
2007-03-14 17:32:49 +01:00
if nodelist==None:
nodelist=G.nodes()
if vec == None:
vec = G.degree(nodelist)
if len(vec)!=len(nodelist):
raise NXUTILSError("The node value vector is not of same length (%s) as the nodelist(%s)") %(len(vec), len(nodelist))
A = NX.ad_matrix(G, nodelist=nodelist)
for i, node in enumerate(nodelist):
nei_i = A[i,:]==1
vec_i = vec[nei_i]
2006-12-18 12:59:12 +01:00
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
I = scipy.eye(n)
A = weighted_adj_matrix(G)
D = I*scipy.sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
2007-03-14 17:32:49 +01:00
def grow_subnetworks(G, T2):
2007-01-25 12:58:10 +01:00
"""Return the highest scoring (T2-test) subgraph og G.
2007-03-14 17:32:49 +01:00
Use simulated annealing to identify highly grow subgraphs.
2007-01-25 12:58:10 +01:00
ref: -- Ideker et.al (Bioinformatics 18, 2002)
-- Patil and Nielsen (PNAS 2006)
"""
N = 1000
states = [(node, False) for node in G.nodes()]
t2_last = 0.0
for i in xrange(N):
if i==0: #assign random states
states = [(state[0], True) for state in states if rand(1)>.5]
sub_nodes = [state[0] for state in states if state[1]]
Gsub = NX.subgraph(G, sub_nodes)
Gsub = NX.connected_components_subgraphs(Gsub)[0]
t2 = [T2[node] for node in Gsub]
if t2>t2_last:
pass
else:
p = numpy.exp()
2006-12-18 12:59:12 +01:00
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerical unstable eigenvalues and set to zero
# othervise some inverses wil explode ->ok ..using pinv for inverses
#
# This gives results that look numerical unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of exp. of adj. metrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable an returns nans of some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2206: update for use in numpy
2007-07-23 19:33:21 +02:00
# 27.04.2007: diffusion now uses pade approximations to matrix exponential. Also the last
2006-12-18 12:59:12 +01:00
2007-01-25 12:58:10 +01:00
def K_expAdj(W, normalised=True, alpha=1.0):
2006-12-18 12:59:12 +01:00
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
2007-01-25 12:58:10 +01:00
def K_vonNeumann(W, normalised=True, alpha=1.0):
2006-12-18 12:59:12 +01:00
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
2007-07-23 19:33:21 +02:00
""" This is the matrix pseudo inverse of L.
Also known as the average commute time matrix.
2006-12-18 12:59:12 +01:00
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
2007-01-25 12:58:10 +01:00
T = diag(sqrt(1./sum(W, 0)))
L = dot(dot(T, L), T)
2006-12-18 12:59:12 +01:00
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
2007-01-25 12:58:10 +01:00
K = dot(dot(vr, diag(psigma)), vri).astype(t)
2006-12-18 12:59:12 +01:00
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
2007-07-23 19:33:21 +02:00
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5, use_cut=False):
2006-12-18 12:59:12 +01:00
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
2007-07-23 19:33:21 +02:00
m, n = W.shape
2006-12-18 12:59:12 +01:00
if t in ['F','D']:
raise TypeError, "Complex input!"
2007-07-23 19:33:21 +02:00
D = diag(W.sum(0))
L = D - W
2006-12-18 12:59:12 +01:00
if normalised==True:
2007-07-23 19:33:21 +02:00
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
e, vr = eig(L)
2006-12-18 12:59:12 +01:00
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
2007-07-23 19:33:21 +02:00
psigma = eye(m) # if eigvals are 0 exp(0)=1 (unnecessary)
2006-12-18 12:59:12 +01:00
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
2007-07-23 19:33:21 +02:00
#else:
# psigma[i,i] = 0.0
2006-12-18 12:59:12 +01:00
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
2007-07-23 19:33:21 +02:00
def K_diffusion2(W, normalised=True, alpha=1.0, beta=0.5, ncomp=None):
"""Returns diffusion kernel, using fast pade approximation.
input:
-- W, adj. matrix
-- normalised [True/False]
-- beta, [0->), (diffusion degree)
"""
D = diag(W.sum(0))
L = D - W
if normalised==True:
T = diag(sqrt(1./W.sum(0)))
L = dot(dot(T, L), T)
return expm(-beta*L)
2006-12-18 12:59:12 +01:00
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
# normalize W (: W'W=I)
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
2007-07-23 19:33:21 +02:00
def modularity_matrix(G, nodelist=None):
if not nodelist:
nodelist = G.nodes()
else:
G = NX.subgraph(G, nodelist)
A = NX.adj_matrix(G, nodelist=nodelist)
d = atleast_2d(G.degree(nbunch=nodelist))
m = 1.*G.number_of_edges()
B = A - A/m
return B