sygnal/bicluster.py at master · plaisier-lab/sygnal · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#################################################################
# @Program: bicluster.py                                        #
# @Version: 1                                                   #
# @Author: Chris Plaisier                                       #
# @Sponsored by:                                                #
# Nitin Baliga, ISB                                             #
# Institute for Systems Biology                                 #
# 1441 North 34th Street                                        #
# Seattle, Washington  98103-8904                               #
# (216) 732-2139                                                #
# @Also Sponsored by:                                           #
# Luxembourg Systems Biology Grant                              #
#                                                               #
# If this program is used in your analysis please mention who   #
# built it. Thanks. :-)                                         #
#                                                               #
# Copyrighted by Chris Plaisier  12/8/2009                      #
#################################################################

import os
from pssm import PSSM
import sqlite3 as lite

class Bicluster:
    """Hold the information pertaining to one bicluster.

    On construction the member genes, member conditions, residual and
    motif PSSMs of cluster ``k`` are read from the SQLite database at
    ``sqliteDb``.  After that the object is a plain in-memory container
    with small accessors for its genes, conditions, PSSMs and free-form
    attributes.

    Attributes set by the constructor:
        k, maxIter, motifIter, sqliteDb -- the constructor arguments
        name            -- 'Bicluster <k>'
        attributes      -- dict filled through add_attribute()
        genes           -- list of gene names (str)
        conditions      -- list of condition names
        norm_residual   -- residual of the cluster as a float
        pssms_upstream  -- PSSM objects for motifs with seqtype 'upstream'
        pssms_3putr     -- PSSM objects for motifs of any other seqtype
    """

    def __init__(self, k, maxIter, de_novo_method_upstream=None, de_novo_method_3pUTR=None, sqliteDb=None, motifIter=2001):
        """Create the bicluster and load its data from the database.

        k        -- cluster number (must be usable with '%d')
        maxIter  -- iteration whose row/column membership and residual
                    are loaded
        de_novo_method_upstream, de_novo_method_3pUTR
                 -- label of the motif finder, used in the PSSM names;
                    the upstream label wins when both are given
        sqliteDb -- path of the SQLite database to read
        motifIter -- iteration whose motifs are loaded; defaults to 2001,
                    the value that used to be hard-coded in the query
        """
        self.k = k
        self.maxIter = maxIter
        self.motifIter = motifIter
        self.attributes = {}
        self.sqliteDb = sqliteDb
        self.name = 'Bicluster %d' % k
        self.__read_data(de_novo_method_upstream, de_novo_method_3pUTR)

    # Reads in the bicluster data generated by cMonkeyWrapper
    def __read_data(self, de_novo_method_upstream=None, de_novo_method_3pUTR=None):
        """Populate genes, conditions, residual and PSSMs from the database.

        Raises IndexError when cluster_stats has no row for this cluster
        and iteration.  The connection is closed even when a query fails.
        """
        # Bind the selectors as integers: the original string-built SQL
        # compared them as numeric literals, and int() also lets numeric
        # strings be bound cleanly.
        cluster = int(self.k)
        iteration = int(self.maxIter)
        motif_iteration = int(self.motifIter)

        # Attach to the database
        con = lite.connect(self.sqliteDb)
        try:
            con.row_factory = lite.Row
            cur = con.cursor()

            # Read in the genes (rows) that are members of this bicluster
            cur.execute(
                'SELECT name FROM row_members, row_names'
                ' WHERE row_members.cluster=? and row_members.iteration=?'
                ' and row_members.order_num=row_names.order_num',
                (cluster, iteration))
            self.genes = [str(row['name']) for row in cur.fetchall()]

            # Read in conditions
            cur.execute(
                'SELECT name FROM column_members, column_names'
                ' WHERE column_members.cluster=? and column_members.iteration=?'
                ' and column_members.order_num=column_names.order_num',
                (cluster, iteration))
            self.conditions = [row['name'] for row in cur.fetchall()]

            # Read in residuals
            cur.execute(
                'SELECT residual FROM cluster_stats WHERE cluster=? and iteration=?',
                (cluster, iteration))
            stats = cur.fetchone()
            if stats is None:
                # Same exception type the former data[0] lookup raised,
                # but with a message that says what is missing.
                raise IndexError('No cluster_stats row for cluster %s at iteration %s'
                                 % (self.k, self.maxIter))
            self.norm_residual = float(stats['residual'])

            # TODO: at some point it might be nice to also read in the
            # associations with clinical traits (self.cor from
            # biclust/<k>/correlation.csv and self.surv from
            # biclust/<k>/survival.csv).

            # Now load up the PSSMs
            cur.execute(
                'SELECT rowid, seqtype, motif_num, evalue FROM motif_infos'
                ' WHERE cluster=? and iteration=?',
                (cluster, motif_iteration))
            motifs = cur.fetchall()

            # Set the de novo method
            # NOTE(review): one label is used for every motif, whatever its
            # seqtype, and the name concatenation below raises TypeError if
            # both labels are None while motifs exist -- kept as is so PSSM
            # names stay unchanged; confirm this is intended.
            if de_novo_method_upstream is not None:
                de_novo_method = de_novo_method_upstream
            else:
                de_novo_method = de_novo_method_3pUTR

            self.pssms_upstream = []
            self.pssms_3putr = []
            for motif in motifs:
                name = str(self.k)+'_motif'+str(motif['motif_num'])+'_'+de_novo_method

                # Sites of this motif, joined to the gene names
                cur.execute(
                    'SELECT row_names.name, motif_annotations.position,'
                    ' motif_annotations.reverse, motif_annotations.pvalue'
                    ' FROM motif_annotations, row_names'
                    ' WHERE motif_annotations.motif_info_id=?'
                    ' and motif_annotations.gene_num=row_names.order_num',
                    (motif['rowid'],))
                sites = [[str(row['name']), row['position'], row['reverse'], row['pvalue']]
                         for row in cur.fetchall()]
                nsites = len(sites)
                eValue = motif['evalue']

                # Rows of the position specific scoring matrix
                cur.execute(
                    'SELECT a, c, g, t FROM motif_pssm_rows WHERE motif_info_id=?',
                    (motif['rowid'],))
                matrix = [[row['a'], row['c'], row['g'], row['t']] for row in cur.fetchall()]

                # Unique genes that carry at least one site
                genes = list(set([site[0] for site in sites]))

                pssm = PSSM(name, nsites, eValue, matrix, genes, de_novo_method)
                if motif['seqtype']=='upstream':
                    self.pssms_upstream.append(pssm)
                else:
                    self.pssms_3putr.append(pssm)
        finally:
            con.close()

    def num_genes(self):
        """Return the number of genes in this bicluster."""
        return len(self.genes)

    def num_conditions(self):
        """Return the number of conditions in this bicluster."""
        return len(self.conditions)

    # Returns the upstream pssm with the given name for this bicluster
    def pssm_upstream(self, name):
        """Return the upstream PSSM called name.

        When no PSSM has that name the string
        'No such PSSM with that name!' is returned instead (no exception).
        """
        for i in self.pssms_upstream:
            if i.name == name:
                return i
        return 'No such PSSM with that name!'

    # Returns the 3' UTR pssm with the given name for this bicluster
    def pssm_3putr(self, name):
        """Return the 3' UTR PSSM called name.

        When no PSSM has that name the string
        'No such PSSM with that name!' is returned instead (no exception).
        """
        for i in self.pssms_3putr:
            if i.name == name:
                return i
        return 'No such PSSM with that name!'

    def pssm_names_upstream(self):
        """Return the names of the upstream PSSMs, in stored order."""
        return [pssm.name for pssm in self.pssms_upstream]

    def pssm_names_3putr(self):
        """Return the names of the 3' UTR PSSMs, in stored order."""
        return [pssm.name for pssm in self.pssms_3putr]

    # Adds an upstream PSSM to a bicluster
    def add_pssm_upstream(self, pssm):
        """Append pssm to the upstream PSSMs."""
        self.pssms_upstream.append(pssm)

    # Adds a 3' UTR PSSM to a bicluster
    def add_pssm_3putr(self, pssm):
        """Append pssm to the 3' UTR PSSMs."""
        self.pssms_3putr.append(pssm)

    # Add attribute
    def add_attribute(self, key, value):
        """Store value under key in self.attributes.

        An error line is printed when key or value is None; the pair is
        stored regardless, as before.
        """
        if key is None or value is None:
            # Single-argument call form prints the same text on Python 2
            # (print statement) and Python 3 (print function).
            print('Error:  Key or Value have no data')
        # NOTE(review): the pair is stored even after the error message --
        # confirm whether an early return was intended.
        self.attributes[key] = value