forked from baliga-lab/sygnal
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbicluster.py
More file actions
executable file
·138 lines (126 loc) · 6.36 KB
/
bicluster.py
File metadata and controls
executable file
·138 lines (126 loc) · 6.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#################################################################
# @Program: bicluster.py #
# @Version: 1 #
# @Author: Chris Plaisier #
# @Sponsored by: #
# Nitin Baliga, ISB #
# Institute for Systems Biology #
# 1441 North 34th Street #
# Seattle, Washington 98103-8904 #
# (216) 732-2139 #
# @Also Sponsored by: #
# Luxembourg Systems Biology Grant #
# #
# If this program is used in your analysis please mention who #
# built it. Thanks. :-) #
# #
# Copyrighted by Chris Plaisier 12/8/2009 #
#################################################################
import os
from pssm import PSSM
import sqlite3 as lite
class Bicluster:
    """Hold the data of one bicluster read from a cMonkey SQLite database.

    Constructing an instance immediately opens ``sqliteDb`` and loads the
    member genes, member conditions, residual and motif PSSMs of cluster
    ``k``.

    Attributes set by the constructor:
        k, maxIter, sqliteDb -- the constructor arguments, stored unchanged.
        name           -- display name, ``'Bicluster <k>'``.
        attributes     -- dict of free-form attributes (see add_attribute).
        genes          -- list of gene names (``str``) in the bicluster.
        conditions     -- list of condition names in the bicluster.
        norm_residual  -- residual of the bicluster as a ``float``.
        pssms_upstream -- list of PSSM objects whose seqtype is 'upstream'.
        pssms_3putr    -- list of PSSM objects of every other seqtype.
    """

    # Iteration used to select rows from the ``motif_infos`` table.
    # NOTE(review): this value was hard-coded in the original query rather
    # than taken from ``maxIter``; confirm whether it should follow maxIter.
    MOTIF_ITERATION = 2001

    # Returned (instead of raising) when a PSSM lookup by name fails.
    _NO_SUCH_PSSM = 'No such PSSM with that name!'

    def __init__(self, k, maxIter, de_novo_method_upstream=None, de_novo_method_3pUTR=None, sqliteDb=None):
        """Create the bicluster and load its data from the database.

        Parameters:
            k        -- cluster number; must be numeric (used with ``%d``).
            maxIter  -- iteration whose membership and residual are loaded.
            de_novo_method_upstream -- label of the de novo motif method;
                        takes precedence over ``de_novo_method_3pUTR``.
            de_novo_method_3pUTR    -- label used when the upstream label
                        is None.
            sqliteDb -- path of the SQLite database to read.

        Raises:
            TypeError  -- if motifs exist but both method labels are None.
            IndexError -- if ``cluster_stats`` has no row for (k, maxIter).
            sqlite3 errors are propagated unchanged.
        """
        self.k = k
        self.maxIter = maxIter
        self.attributes = {}
        self.sqliteDb = sqliteDb
        self.name = 'Bicluster %d' % k
        self.__read_data(de_novo_method_upstream, de_novo_method_3pUTR)

    def __read_data(self, de_novo_method_upstream=None, de_novo_method_3pUTR=None):
        """Read genes, conditions, residual and PSSMs from the database.

        The connection is always closed, including when a query fails.
        """
        con = lite.connect(self.sqliteDb)
        try:
            con.row_factory = lite.Row
            cur = con.cursor()

            # Bind plain ints so that integer-like values (e.g. numpy
            # integers) are accepted as SQL parameters.
            cluster = int(self.k)
            iteration = int(self.maxIter)

            # Genes that are members of this bicluster
            cur.execute(
                'SELECT name FROM row_members, row_names'
                ' WHERE row_members.cluster=? AND row_members.iteration=?'
                ' AND row_members.order_num=row_names.order_num',
                (cluster, iteration))
            self.genes = [str(row['name']) for row in cur.fetchall()]

            # Conditions that are members of this bicluster
            cur.execute(
                'SELECT name FROM column_members, column_names'
                ' WHERE column_members.cluster=? AND column_members.iteration=?'
                ' AND column_members.order_num=column_names.order_num',
                (cluster, iteration))
            self.conditions = [row['name'] for row in cur.fetchall()]

            # Residual of this bicluster
            cur.execute(
                'SELECT residual FROM cluster_stats WHERE cluster=? AND iteration=?',
                (cluster, iteration))
            stats = cur.fetchall()
            self.norm_residual = float(stats[0]['residual'])

            # TODO: also load associations with clinical traits. The disabled
            # prototype read 'biclust/<k>/correlation.csv' (columns: trait,
            # cor, pValue) into self.cor and 'biclust/<k>/survival.csv'
            # (columns: name, zScore, pValue) into self.surv, skipping the
            # header line of each file.

            # The upstream label wins; the 3' UTR label is only a fallback.
            if de_novo_method_upstream is not None:
                de_novo_method = de_novo_method_upstream
            else:
                de_novo_method = de_novo_method_3pUTR

            self._load_pssms(cur, cluster, de_novo_method)
        finally:
            con.close()

    def _load_pssms(self, cur, cluster, de_novo_method):
        """Fill pssms_upstream / pssms_3putr from the motif tables."""
        cur.execute(
            'SELECT rowid, seqtype, motif_num, evalue FROM motif_infos'
            ' WHERE cluster=? AND iteration=?',
            (cluster, self.MOTIF_ITERATION))
        motifs = cur.fetchall()

        self.pssms_upstream = []
        self.pssms_3putr = []

        # The label is concatenated into every PSSM name, so fail with a
        # clear message rather than an opaque "str + None" error.
        if motifs and de_novo_method is None:
            raise TypeError('de_novo_method_upstream or de_novo_method_3pUTR '
                            'must be given when the bicluster has motifs')

        for motif in motifs:
            motif_id = motif['rowid']
            name = str(self.k) + '_motif' + str(motif['motif_num']) + '_' + de_novo_method

            # One row per annotated site of this motif
            cur.execute(
                'SELECT row_names.name, motif_annotations.position,'
                ' motif_annotations.reverse, motif_annotations.pvalue'
                ' FROM motif_annotations, row_names'
                ' WHERE motif_annotations.motif_info_id=?'
                ' AND motif_annotations.gene_num=row_names.order_num',
                (motif_id,))
            site_genes = [str(row['name']) for row in cur.fetchall()]
            nsites = len(site_genes)
            genes = list(set(site_genes))  # unique genes carrying a site

            cur.execute(
                'SELECT a, c, g, t FROM motif_pssm_rows WHERE motif_info_id=?',
                (motif_id,))
            matrix = [[row['a'], row['c'], row['g'], row['t']] for row in cur.fetchall()]

            pssm = PSSM(name, nsites, motif['evalue'], matrix, genes, de_novo_method)
            if motif['seqtype'] == 'upstream':
                self.pssms_upstream.append(pssm)
            else:
                self.pssms_3putr.append(pssm)

    def _find_pssm(self, pssms, name):
        """Return the first PSSM in ``pssms`` named ``name``.

        Returns the string 'No such PSSM with that name!' when none matches.
        """
        for pssm in pssms:
            if pssm.name == name:
                return pssm
        return self._NO_SUCH_PSSM

    def num_genes(self):
        """Return the number of genes in the bicluster."""
        return len(self.genes)

    def num_conditions(self):
        """Return the number of conditions in the bicluster."""
        return len(self.conditions)

    # Returns the upstream PSSM with the given name for this bicluster
    def pssm_upstream(self, name):
        """Return the upstream PSSM called ``name``.

        Returns the string 'No such PSSM with that name!' if absent.
        """
        return self._find_pssm(self.pssms_upstream, name)

    # Returns the 3' UTR PSSM with the given name for this bicluster
    def pssm_3putr(self, name):
        """Return the 3' UTR PSSM called ``name``.

        Returns the string 'No such PSSM with that name!' if absent.
        """
        return self._find_pssm(self.pssms_3putr, name)

    def pssm_names_upstream(self):
        """Return the names of all upstream PSSMs, in stored order."""
        return [pssm.name for pssm in self.pssms_upstream]

    def pssm_names_3putr(self):
        """Return the names of all 3' UTR PSSMs, in stored order."""
        return [pssm.name for pssm in self.pssms_3putr]

    # Adds a PSSM to a bicluster
    def add_pssm_upstream(self, pssm):
        """Append ``pssm`` to the upstream PSSM list."""
        self.pssms_upstream.append(pssm)

    # Adds a PSSM to a bicluster
    def add_pssm_3putr(self, pssm):
        """Append ``pssm`` to the 3' UTR PSSM list."""
        self.pssms_3putr.append(pssm)

    # Add attribute
    def add_attribute(self, key, value):
        """Store ``value`` under ``key`` in ``self.attributes``.

        A None key or value only triggers a printed error message; the
        pair is still stored, as it always has been.
        """
        if key is None or value is None:
            # Single-argument call form works as both a Python 2 print
            # statement and a Python 3 function call.
            print('Error: Key or Value have no data')
        self.attributes[key] = value