# laydi/fluents/lib/nx_utils.py
import os,sys
from itertools import izip
import networkx as NX
from scipy import shape,diag,dot,asarray,sqrt,real,zeros,eye,exp,maximum,\
     outer,sum,ravel,array,mean
from numpy.random import rand
from scipy.linalg import eig,svd,inv,expm,norm
from cx_utils import sorted_eig
import numpy
eps = numpy.finfo(float).eps.item()
feps = numpy.finfo(numpy.single).eps.item()
_array_precision = {'f': 0, 'd': 1, 'F': 0, 'D': 1,'i': 1}
def xgraph_to_graph(G):
"""Convert an Xgraph to an ordinary graph.
Edge attributes, mult.edges and self-loops are lost in the process.
"""
GG = NX.convert.from_dict_of_lists(NX.convert.to_dict_of_lists(G))
return GG
def get_affinity_matrix(G, data, ids, dist='e', mask=None, weight=None, t=0, out='dist'):
"""
Function for calculating a general affinity matrix, based upon distances.
    Affinity = 1 - distance (rescaled so that the smallest value means far apart)
INPUT
data:
gene expression data, type dict data[gene] = expression-vector
G:
The network (networkx.base.Graph object)
mask:
The array mask shows which data are missing. If mask[i][j]==0, then
data[i][j] is missing.
    weight:
The array weight contains the weights to be used when calculating distances.
transpose:
If transpose==0, then genes are clustered. If transpose==1, microarrays are
clustered.
dist:
The character dist defines the distance function to be used:
dist=='e': Euclidean distance
dist=='b': City Block distance
dist=='h': Harmonically summed Euclidean distance
dist=='c': Pearson correlation
dist=='a': absolute value of the correlation
dist=='u': uncentered correlation
dist=='x': absolute uncentered correlation
dist=='s': Spearman's rank correlation
dist=='k': Kendall's tau
For other values of dist, the default (Euclidean distance) is used.
OUTPUT
D :
        Similarity matrix (nGenes x nGenes), symmetric, d_ij in [0,1]
Normalized so max weight = 1.0
"""
    try:
        from Bio import Cluster as CLS
    except ImportError:
        raise ImportError, "Biopython (Bio.Cluster) is required"
nVar = len(data)
nSamp = len(data[data.keys()[0]])
    X = zeros((nVar, nSamp), dtype='<f8')
    for i,gene in enumerate(ids): # ids define the row ordering
X[i,:] = data[gene]
#X = transpose(X) # distancematrix needs matrix as (nGenes,nSamples)
D_list = CLS.distancematrix(X, dist=dist)
D = zeros((nVar,nVar),dtype='<f8')
for i,row in enumerate(D_list):
if i>0:
D[i,:len(row)]=row
D = D + D.T
MAX = 30.0
D_max = max(ravel(D))/MAX
    D_n = D/D_max # normalised (max = MAX)
D_n = (MAX+1.) - D_n #using correlation (inverse distance for dists)
A = NX.adj_matrix(G, nodelist=ids)
if out=='dist':
return D_n*A
elif out=='heat_kernel':
t=1.0
K = exp(-t*D*A)
return K
elif out=='complete':
return D_n
else:
return []
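# Example (sketch, not from the original code): assuming Bio.Cluster is
# installed and `data` maps gene ids to expression vectors,
#   K = get_affinity_matrix(G, data, ids, dist='c', out='heat_kernel')
# returns a heat-kernel style affinity matrix over the nodes listed in `ids`.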
def remove_one_degree_nodes(G, iter=True):
"""Removes all nodes with only one neighbour. These nodes does
not contribute to community structure.
input:
G -- graph
iter -- True/False iteratively remove?
"""
G_copy = G.copy()
if iter==True:
while 1:
bad_nodes=[]
for node in G_copy.nodes():
if len(G_copy.neighbors(node))==1:
bad_nodes.append(node)
if len(bad_nodes)>0:
G_copy.delete_nodes_from(bad_nodes)
else:
break
    else:
        bad_nodes = []
        for node in G_copy.nodes():
            if len(G_copy.neighbors(node))==1:
                bad_nodes.append(node)
        if len(bad_nodes)>0:
            G_copy.delete_nodes_from(bad_nodes)
print "Deleted %s nodes from network" %(len(G)-len(G_copy))
return G_copy
def key_players(G, n=1, with_labels=False):
"""
    Resilience measure
Identification of key nodes by fraction of nodes in
disconnected subgraph when the node is removed.
output:
fraction of nodes disconnected when node i is removed
"""
i=0
frac=[]
labels = {}
for node in G.nodes():
i+=1
print i
T = G.copy()
T.delete_node(node)
n_nodes = T.number_of_nodes()
sub_graphs = NX.connected_component_subgraphs(T)
        n_comp = len(sub_graphs)
        if n_comp>1:
strong_comp = sub_graphs[0]
fraction = 1.0 - 1.0*strong_comp.number_of_nodes()/n_nodes
frac.append(fraction)
labels[node]=fraction
else:
frac.append(0.0)
labels[node]=0.0
out = 1.0 - array(frac)
if with_labels==True:
return out,labels
else:
return out
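# Example (sketch, not from the original code): on a connected graph G,
#   resilience, labels = key_players(G, with_labels=True)
# returns 1 - (fraction of nodes disconnected when each node is removed);
# nodes whose removal strands a large part of the graph therefore get low values.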
def node_weighted_adj_matrix(G, weights=None, ave_type='harmonic', with_labels=False):
"""Return a weighted adjacency matrix of graph. The weights are
node weights.
input: G -- graph
weights -- dict, keys: nodes, values: weights
with_labels -- True/False, return labels?
    output: A -- weighted adjacency matrix
[index] -- node labels
"""
n=G.order()
# make an dictionary that maps vertex name to position
index={}
count=0
for node in G.nodes():
index[node]=count
count = count+1
a = zeros((n,n))
    if G.__class__.__name__ == 'XGraph':
        raise ValueError, "Use a plain Graph; XGraph is not supported here"
for head,tail in G.edges():
if ave_type == 'geometric':
a[index[head],index[tail]]= sqrt(weights[head]*weights[tail])
a[index[tail],index[head]]= a[index[head],index[tail]]
        elif ave_type == 'harmonic':
            # note: this is the arithmetic mean of the two node weights
            a[index[head],index[tail]] = mean([weights[head], weights[tail]])
            a[index[tail],index[head]] = a[index[head],index[tail]]
if with_labels:
return a,index
else:
return a
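# Example (sketch, not from the original code): using node degrees as weights,
#   weights = dict((node, float(G.degree(node))) for node in G.nodes())
#   A, idx = node_weighted_adj_matrix(G, weights=weights, with_labels=True)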
def weighted_adj_matrix(G, with_labels=False):
"""Adjacency matrix of an XGraph whos weights are given in edges.
"""
A, labels = NX.adj_matrix(G, with_labels=True)
W = A.astype('<f8')
for orf, i in labels.items():
for orf2, j in labels.items():
if G.has_edge(orf, orf2):
edge_weight = G.get_edge(orf, orf2)
W[i,j] = edge_weight
W[j,i] = edge_weight
if with_labels==True:
return W, labels
else:
return W
def assortative_index(G):
"""Ouputs two vectors: the degree and the neighbor average degree.
Used to measure the assortative mixing. If the average degree is
pos. correlated with the degree we know that hubs tend to connect
to other hubs.
input: G, graph connected!!
ouput: d,mn_d: degree, and average degree of neighb.
(degree sorting from degree(with_labels=True))
"""
d = G.degree(with_labels=True)
out=[]
for node in G.nodes():
nn = G.neighbors(node)
if len(nn)>0:
nn_d = mean([float(d[i]) for i in nn])
out.append((d[node], nn_d))
return array(out).T
def struct_equivalence(G,n1,n2):
"""Returns the structural equivalence of a node pair. Two nodes
are structural equal if they share the same neighbors.
x_s = [ne(n1) union ne(n2) - ne(n1) intersection ne(n2)]/[ne(n1)
union ne(n2) + ne(n1) intersection ne(n2)]
ref: Brun et.al 2003
"""
    s1 = set(G.neighbors(n1))
    s2 = set(G.neighbors(n2))
    num_union = len(s1.union(s2))
    num_intersection = len(s1.intersection(s2))
    if (num_union + num_intersection) == 0:
        xs = 0.0
    else:
        xs = float(num_union - num_intersection)/(num_union + num_intersection)
return xs
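# Example (sketch, not from the original code): if n1 has neighbours {a, b, c}
# and n2 has neighbours {b, c, d}, the union has 4 members and the
# intersection 2, so x_s = (4 - 2)/(4 + 2) = 1/3.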
def struct_equivalence_all(G):
"""Not finnished.
"""
A,labels = NX.adj_matrix(G,with_labels=True)
pass
def hamming_distance(n1,n2):
"""Not finnsihed.
"""
pass
def graph_corrcoeff(G):
"""Not finnished.
"""
A,index = NX.adj_matrix(G,with_labels=True)
#C = zeros(*A.shape(),'d')
n = 1.*G.number_of_nodes()
for node in G.nodes():
a_j = A[index[node],:] #neighbors
mean_a = sum(a_j)/n# degree(G)/number_of_nodes()
var_a = sqrt(sum((a_j - mean_a)**2)/n)
pass
def graph_and_data_intersection(data, graph, pathways=None,
keep_connected=True):
"""Returns the intersection of keys in two dictionaries.
NB: keep track of identifer sorting after these dict transforms.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx,base.graph, full graph
pathways -- dict, keys: pathway name, values: nodes in pathway
call:
new_data, new_graph,pathways = graph_and_data_intersection(data,graph,pathways,keep_connected=True)
"""
new_graph = graph.copy()
new_data = {}
new_pathways = {}
graph_set = set(graph.nodes())
data_set = set(data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set) #remove difference
for k in intersection:
new_data[k] = data[k]
if keep_connected:
max_iter = 0
        sub_graphs = NX.connected_component_subgraphs(new_graph)
        new_graph = sub_graphs[0] # keep the largest connected component
old_data = new_data
while new_graph.number_of_nodes() != len(new_data) and max_iter<100:
max_iter+=1
            graph_set = set(new_graph.nodes())
            data_set = set(new_data.keys())
intersection = data_set & graph_set
new_graph.delete_nodes_from(graph_set - data_set)
new_data={}
for k in intersection:
new_data[k] = old_data[k]
old_data = new_data.copy()
new_graph = NX.connected_component_subgraphs(new_graph)[0]
if pathways!=None:
for pth,nodes in pathways.items():
new_pathways[pth] = [node for node in nodes if node in new_graph]
print "\nSUMMARY (graph_and_data_intersection): "
print "Number of input variables: %s\n\
Number nodes in input graph: %s" %(len(data),len(graph))
print "\nUsing intersection of connected graph and nodes with data values"
print "Number of variables is now: %s" %len(new_data)
print "Number of nodes in graph: %s" %new_graph.number_of_nodes()
if pathways!=None:
return new_data,new_graph,new_pathways
else:
return new_data,new_graph
def rx_graph_and_data_intersection(graph,node_data,pathways,data,keep_connected=False):
"""Returns a (connected) reaction graph with present gene expression data.
keep_connected==True:
When a node (gene) is not present in our expression data, the node
is deleted and all neighbors are connected with edge weight=0.5
if the are not already neigbors.
input:
data -- dict, keys: gene id, value: measurement profile
graph -- networkx.xbase.xgraph, full wieghted graph
node_data -- dict, keys: rx id, value: set of gene_ids
pathways -- dict, keys: pathway name, values: lidt of nodes in pathway
"""
# We do not connect the full graph ... may be performed by using the reference graph?
graph = NX.connected_component_subgraphs(graph)[0] #largest connected component
new_graph = graph.copy()
new_data = {}
new_node_data = node_data.copy()
new_pathways = {}
genes_in_graph=set()
genes_in_data = set(data.keys())
rx_in_graph = set(new_graph.nodes())
# genes in graph nodes (rx_nodes)
for rx in rx_in_graph:
genes_in_graph.update(set(new_node_data.get(rx)))
keep_genes = genes_in_data.intersection(genes_in_graph) #both in graph and data
#update node data
for rx,genes in node_data.items(): # delete node data of nodes not present in graph
genes = set(genes)
genes.intersection_update(keep_genes) #remove genes if they are not in inters.
if len(genes)==0 or rx not in rx_in_graph: #no gene data or not in graph
print "removing: " + str(rx)
del new_node_data[rx]
rx_in_data= set(new_node_data.keys())
rx_intersection = rx_in_data.intersection(rx_in_graph)
for gene in keep_genes:
new_data[gene] = data.get(gene)
# update pathways nodes
for pth,genes in pathways.items():
if genes:
genes = set(genes)
genes.intersection_update(keep_genes) # gene needs to have data
else:
pass
new_pathways[pth] = genes
bad_nodes = rx_in_graph.difference(rx_in_data) #in graph but no data
if keep_connected==True:
dummy = new_graph.copy()
for rx in bad_nodes:
dummy.delete_node(rx)
if len(NX.connected_component_subgraphs(dummy))>1:
nghbrs = new_graph.neighbors(rx)
for i in nghbrs:
for j in nghbrs:
if i!=j:
if not new_graph.has_edge(i,j):
new_graph.add_edge(i,j,0.5)
#update graph
new_graph.delete_nodes_from(list(bad_nodes))
return new_graph,new_node_data,new_pathways,new_data
def weighted_laplacian(G,with_labels=False):
"""Return standard Laplacian of graph from a weighted adjacency matrix."""
n= G.order()
    I = eye(n)
    A = weighted_adj_matrix(G)
    D = I*sum(A, 0)
L = D-A
if with_labels:
A,index = weighted_adj_matrix(G, with_labels=True)
return L, index
else:
return L
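# Example (sketch, not from the original code): for an XGraph G with numeric
# edge weights,
#   L, idx = weighted_laplacian(G, with_labels=True)
# gives L = D - W, where the diagonal of D holds the weighted node degrees.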
def subnetworks(G, T2):
"""Return the highest scoring (T2-test) subgraph og G.
Use simulated annealing to identify highly scoring subgraphs.
ref: -- Ideker et.al (Bioinformatics 18, 2002)
-- Patil and Nielsen (PNAS 2006)
"""
N = 1000
states = [(node, False) for node in G.nodes()]
t2_last = 0.0
    for i in xrange(N):
        if i==0: # assign random initial states
            states = [(state[0], rand() > .5) for state in states]
        sub_nodes = [state[0] for state in states if state[1]]
        Gsub = NX.subgraph(G, sub_nodes)
        Gsub = NX.connected_component_subgraphs(Gsub)[0]
        t2 = sum([T2[node] for node in Gsub]) # aggregate score of the subgraph
        if t2 > t2_last:
            t2_last = t2
        else:
            # FIXME: annealing acceptance step (p = exp(...)) not implemented
            pass
"""Below are methods for calculating graph metrics
Four main decompositions :
0.) Adjacency diffusion kernel expm(A),
1.) von neumann kernels (diagonalisation of adjacency matrix)
2.) laplacian kernels (geometric series of adj.)
3.) diffusion kernels (exponential series of adj.)
---- Kv
von_neumann : Kv = (I-alpha*A)^-1 (mod: A(I-alpha*A)^-1)? ,
geom. series
---- Kl
laplacian: Kl = (I-alpha*L)^-1 , geom. series
---- Kd
laplacian_diffusion: Kd = expm(-alpha*L)
exp. series
---- Ke
Exponential diffusion.
Ke = expm(A) .... expm(-A)?
"""
# TODO:
# check for numerically unstable eigenvalues and set them to zero,
# otherwise some inverses will explode -> ok ... using pinv for inverses
#
# This gives results that look numerically unstable
#
# -- divided adj by sum(A[:]), check this one (paper by Lebart scales with number of edges)
#
#
#
# the neumann kernel is defined in Kandola to be K = A*(I-A)^-1
# lowest eigenvectors are same as the highest of K = A*A ?
# this needs clarification
# diffusion is still wrong! ... ok
# diff needs normalisation?! check the meaning of exp(-s) = exp(1/s) -L = 1/degree ... etc
# Is it the negative of the exp. of the adj. matrix in Kandola?
#
# Normalised=False returns only nans (no idea why!!) ... fixed ok
# 31.1: diff is ok exp(0)=1 not zero!
# 07.03.2005: normalisation is ok: -> normalisation will emphasize high degree nodes
# 10.03.2005: symeig is unstable and returns nans for some eigenvectors? switching back to eig
# 14.05.2006: diffusion returns negative values, using expm(-LL) instead (FIX)
# 13.09.2006: update for use in numpy
def K_expAdj(W, normalised=True, alpha=1.0):
"""Matrix exponential of adjacency matrix, mentioned in Kandola as a general diffusion kernel.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag( sqrt( 1./(sum(W,0))) )
W = dot(dot(T, W), T)
e,vr = eig(W)
s = real(e)**2 # from eigenvalues to singularvalues
vri = inv(vr)
s = maximum.reduce(s) + s
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = abs(cond*maximum.reduce(s))
psigma = eye(m)
for i in range(len(s)):
if abs(s[i]) > cutoff:
psigma[i,i] = .5*alpha*exp(s[i])
return dot(dot(vr,psigma),vri)
def K_vonNeumann(W, normalised=True, alpha=1.0):
""" The geometric series of path lengths.
Returns matrix square root of pseudo inverse of the adjacency matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
W = dot(dot(T,W),T)
e,vr = eig(W)
vri = inv(vr)
e = real(e) # we only work with real pos. eigvals
e = maximum.reduce(e) + e
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,n),t)
for i in range(len(e)):
if e[i] > cutoff:
psigma[i,i] = 1.0/e[i] #these are eig.vals (=sqrt(sing.vals))
return dot(dot(vr,psigma),vri).astype(t)
def K_laplacian(W, normalised=True, alpha=1.0):
""" This is the matrix square root of the pseudo inverse of L.
    Also known as the average commute time matrix.
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D - W
if normalised==True:
T = diag(sqrt(1./sum(W, 0)))
L = dot(dot(T, L), T)
e,vr = eig(L)
e = real(e)
vri = inv(vr)
cond = {0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = cond*maximum.reduce(e)
psigma = zeros((m,),t) # if s close to zero -> set 1/s = 0
for i in range(len(e)):
if e[i] > cutoff:
psigma[i] = 1.0/e[i]
K = dot(dot(vr, diag(psigma)), vri).astype(t)
K = real(K)
I = eye(n)
K = (1-alpha)*I + alpha*K
return K
def K_diffusion(W, normalised=True, alpha=1.0, beta=0.5):
"""Returns diffusion kernel.
input:
-- W, adj. matrix
-- normalised [True/False]
-- alpha, [0,1] (degree of network influence)
-- beta, [0->), (diffusion degree)
"""
W = asarray(W)
t = W.dtype.char
if len(W.shape)!=2:
raise ValueError, "Non-matrix input to matrix function."
m,n = W.shape
if t in ['F','D']:
raise TypeError, "Complex input!"
D = diag(sum(W,0))
L = D-W
if normalised==True:
T = diag(sqrt(1./(sum(W,0))))
L = dot(dot(T,L),T)
e,vr = eig(L)
vri = inv(vr) #inv
cond = 1.0*{0: feps*1e3, 1: eps*1e6}[_array_precision[t]]
cutoff = 1.*abs(cond*maximum.reduce(e))
psigma = eye(m) # if sing vals are 0 exp(0)=1 (unnecessary)
#psigma = zeros((m,n), dtype='<f8')
for i in range(len(e)):
if abs(e[i]) > cutoff:
psigma[i,i] = exp(-beta*e[i])
K = real(dot(dot(vr, psigma), vri))
I = eye(n, dtype='<f8')
K = (1. - alpha)*I + alpha*K
return K
def K_modularity(W,alpha=1.0):
""" Returns the matrix square root of Newmans modularity."""
W = asarray(W)
t = W.dtype.char
m, n = W.shape
d = sum(W, 0)
m = 1.*sum(d)
B = W - (outer(d, d)/m)
s,v = sorted_eig(B, sort_by='lm')
psigma = zeros( (n, n), dtype='<f8' )
for i in range(len(s)):
if s[i]>1e-7:
psigma[i,i] = sqrt(s[i])
#psigma[i,i] = s[i]
K = dot(dot(v, psigma), v.T)
I = eye(n)
K = (1 - alpha)*I + alpha*K
return K
def kernel_score(K, W):
"""Returns the modularity score.
K -- (modularity) kernel
W -- adjacency matrix (possibly weighted)
"""
    # normalise the columns of W to unit length
m, n = shape(W)
for i in range(n):
W[:,i] = W[:,i]/norm(W[:,i])
score = diag(dot(W, dot(K, W)) )
tot = sum(score)
return score, tot
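if __name__ == "__main__":
    # Minimal usage sketch (assumption: a small toy graph; this block is not
    # part of the original module).  Build a graph, form its adjacency matrix
    # and compare a diffusion kernel with the modularity kernel score.
    G = NX.Graph()
    G.add_edges_from([(0, 1), (1, 2), (2, 3), (1, 3)])
    A = asarray(NX.adj_matrix(G, nodelist=G.nodes()), dtype='<f8')
    Kd = K_diffusion(A, normalised=True, alpha=1.0, beta=0.5)
    Km = K_modularity(A, alpha=1.0)
    score, total = kernel_score(Km, A.copy())
    print "diffusion kernel shape:", Kd.shape
    print "modularity kernel score:", total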