@@ -83,6 +83,7 @@ def __init__(self, threads, maxRegions, prefix, width, pad):
8383 # prefix of the ROI BAM file
8484 self .prefix = prefix
8585 # width of region for VCF records
86+ self .pad = pad
8687 # add readlength padding
8788 self .width = width + pad
8889
@@ -118,10 +119,34 @@ def selectROI(self,vcfs,bams,beds):
118119 # now collapse overlapping intervals
119120 for chrom in self .callDict :
120121 self .callDict [chrom ] = reduce ( self .callDict [chrom ] )
122+ # we still have to collapse intervals that are too close to each other
123+ # if they are closer than the readlength, there can be cases when a read is picked
124+ # in both interval, so we have to merge them
125+ self .mergeCloseIntervals (chrom )
126+
121127 # save intervals to a BAM
122128 print ("Saving reads from BAMs:" )
123129 self .saveCallsToBAM (bams )
124130
131+ def mergeCloseIntervals (self ,chrom ):
132+ # we are assuming the intervals are ordered (they should to be)
133+ mergedChrom = []
134+ start = (0 ,0 )
135+ for interv in self .callDict [chrom ]:
136+ # (start[0], start[1])
137+ # (interv[0], interv[1])
138+ # |------------------| |--------------------|
139+ if interv [0 ] - start [1 ] <= self .pad * 2 :
140+ # merge the two intervals
141+ start = (start [0 ],interv [1 ])
142+ elif start [1 ]!= 0 : # if its not the very first one and have enough space between the intervals
143+ # add to the new interval set
144+ mergedChrom .append (start )
145+ start = interv
146+ else : # most of the intervals should fall here
147+ start = interv
148+ self .callDict [chrom ] = mergedChrom
149+
125150 def writeChunk (self ,bam ,chrom ,regions ):
126151 # from the regions list make a list of lists - sublists have 50 entries max
127152 sublists = [regions [i :i + self .maxRegions ] for i in range (0 , len (regions ), self .maxRegions )]
0 commit comments