macs3-project
diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py
Lines changed: 98 additions & 210 deletions
diff --git a/test/yeast_500k_SRR1822137.bedpe.gz b/test/yeast_500k_SRR1822137.bedpe.gz
-587 Bytes
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2025-07-24 10:17:48 Tao Liu>
+# Time-stamp: <2025-07-24 11:37:58 Tao Liu>
 
 """Module for filter duplicate tags from paired-end data
 
@@ -15,8 +15,7 @@
 import io
 import sys
 from array import array as pyarray
-from collections import Counter
-
+from collections import Counter,defaultdict
 # ------------------------------------
 # MACS3 modules
 # ------------------------------------
@@ -1292,265 +1291,154 @@ def exclude(self, regions):
         return
 
     @cython.ccall
-    def sample_frag_percent_copy(self,
-                                 percent: cython.float,
-                                 seed: cython.int = -1):
-        """Sample the fragments for a given percentage, return a new
-        PETrackII object.
-
-        Sampling is performed per chromosome, preserving the barcode
-        associations.
-
+    def sample_percent(self, percent: cython.float, seed: cython.int = -1):
         """
-        num: cython.uint
-        k: bytes
-        chrnames: set
-        indices: cnp.ndarray
-        n_frags: cython.int
-
-        ret_petrackII = PETrackII(anno=self.annotation,
-                                  buffer_size=self.buffer_size)
-        # Copy barcode_dict and barcode_last_n, as barcodes remain valid
-        ret_petrackII.barcode_dict = dict(self.barcode_dict)
-        ret_petrackII.barcode_last_n = self.barcode_last_n
-
-        chrnames = self.get_chr_names()
+        Downsample the total counts, in-place, to `percent` of the current total (a fraction in (0, 1]).
+        Counts are shuffled and sampled per chromosome; the track is re-sorted afterwards.
+        """
+        assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
+        chrnames = sorted(self.get_chr_names())
 
+        # Set up shuffling like PETrackI: a seeded MT19937 RandomState if seed >= 0, else NumPy's global shuffle
         if seed >= 0:
-            info(f"#   A random seed {seed} has been used in the sampling function")
-            rs = np.random.default_rng(seed)
-        else:
-            rs = np.random.default_rng()
-        rs_shuffle = rs.shuffle
-
-        for k in sorted(chrnames):
-            loc = self.locations[k]
-            bar = self.barcodes[k]
-            n_frags = loc.shape[0]
-            if n_frags == 0:
-                continue
-            num = cython.cast(cython.uint, round(n_frags * percent, 5))
-            indices = np.arange(n_frags)
-            rs_shuffle(indices)
-            indices = indices[:num]
-            indices.sort()
-            ret_petrackII.locations[k] = np.copy(loc[indices])
-            ret_petrackII.barcodes[k] = np.copy(bar[indices])
-            ret_petrackII.size[k] = ret_petrackII.locations[k].shape[0]
-            ret_petrackII.buf_size[k] = ret_petrackII.size[k]
-            ret_petrackII.length += np.sum((ret_petrackII.locations[k]['r'] - ret_petrackII.locations[k]['l']) * ret_petrackII.locations[k]['c'])
-            ret_petrackII.total += np.sum(ret_petrackII.locations[k]['c'])
-
-        if ret_petrackII.total > 0:
-            ret_petrackII.average_template_length = cython.cast(cython.float, ret_petrackII.length) / ret_petrackII.total
+            info(f"#   A random seed {seed} has been used")
+            rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
+            rs_shuffle = rs.shuffle
         else:
-            ret_petrackII.average_template_length = 0.0
-
-        ret_petrackII.set_rlengths(self.get_rlengths())
-        ret_petrackII.is_sorted = False  # Shuffling breaks sorting!
-        return ret_petrackII
-
-    @cython.ccall
-    def sample_num(self,
-                   samplesize: cython.ulong,
-                   seed: cython.int = -1):
-        """Downsample the object in-place so that the sum of all
-        counts is samplesize.
-
-        Sampling is performed proportionally to each fragment's count.
-
-        """
-        k: bytes
-        chrnames: set
-        n_total: cython.ulong
-        probs: cnp.ndarray
-        new_counts: cnp.ndarray
-        mask: cnp.ndarray
+            rs_shuffle = np.random.shuffle
 
         self.length = 0
         self.total = 0
         self.average_template_length = 0.0
 
-        chrnames = self.get_chr_names()
-        # Compute the current total counts
-        n_total = 0
-        chr_totals = {}
         for k in chrnames:
-            chr_totals[k] = self.locations[k]['c'].sum()
-            n_total += chr_totals[k]
-
-        if n_total == 0 or samplesize == 0:
-            # Just clear everything
-            for k in chrnames:
-                self.locations[k] = self.locations[k][:0]
-                self.barcodes[k] = self.barcodes[k][:0]
-                self.size[k] = 0
-            self.length = 0
-            self.total = 0
-            self.average_template_length = 0.0
-            return
-
-        if seed >= 0:
-            info(f"# A random seed {seed} has been used in the sampling function")            
-            rs = np.random.default_rng(seed)
-        else:
-            rs = np.random.default_rng()
-
-        for k in sorted(chrnames):
             loc = self.locations[k]
             bar = self.barcodes[k]
-            n_chr = chr_totals[k]
-            if n_chr == 0:
-                # No data in this chromosome
-                self.locations[k] = loc[:0]
-                self.barcodes[k] = bar[:0]
-                self.size[k] = 0
-                continue
-            # Number of counts to sample for this chromosome (proportional)
-            chr_target = int(round(samplesize * n_chr / n_total))
-            if chr_target == 0:
+            counts = loc['c']
+            n = int(counts.sum())
+            n_sample = int(round(n * percent))
+            if n == 0 or n_sample == 0:
                 self.locations[k] = loc[:0]
                 self.barcodes[k] = bar[:0]
                 self.size[k] = 0
                 continue
 
-            counts = loc['c']
-            probs = counts / counts.sum()
-            # Sample how many counts each fragment keeps (multinomial)
-            new_counts = rs.multinomial(chr_target, probs)
-            # Mask: keep fragments with >0 counts
-            mask = new_counts > 0
-            self.locations[k] = loc[mask].copy()
-            self.locations[k]['c'] = new_counts[mask]
-            self.barcodes[k] = bar[mask].copy()
-            self.size[k] = self.locations[k].shape[0]
-            self.length += np.sum((self.locations[k]['r'] - self.locations[k]['l']) * self.locations[k]['c'])
-            self.total += np.sum(self.locations[k]['c'])
+            # Flatten: build an array of indices into loc, repeated by count
+            idx_flat = np.repeat(np.arange(len(loc)), counts)
+            rs_shuffle(idx_flat)
+            idx_flat = idx_flat[:n_sample]
+
+            # Recount: count how many times each index is chosen
+            unique_idx, new_counts = np.unique(idx_flat, return_counts=True)
+            # Compose new arrays
+            new_locs = loc[unique_idx].copy()
+            new_locs['c'] = new_counts
+            new_bars = bar[unique_idx].copy()
+            self.locations[k] = new_locs
+            self.barcodes[k] = new_bars
+            self.size[k] = len(new_locs)
+            self.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c'])
+            self.total += np.sum(new_locs['c'])
+
         if self.total > 0:
             self.average_template_length = float(self.length) / self.total
         else:
             self.average_template_length = 0.0
-        self.is_sorted = False  # shuffling breaks sort
+        self.sort()
         return
 
     @cython.ccall
-    def sample_num_copy(self,
-                        samplesize: cython.ulong,
-                        seed: cython.int = -1):
-        """Downsample to a new PETrackII so that the sum of all counts
-        is samplesize.
-
-        Sampling is performed proportionally to each fragment's count.
-
+    def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1):
         """
-        import numpy as np
-        k: bytes
-        chrnames: set
-        n_total: cython.ulong
-        probs: cnp.ndarray
-        new_counts: cnp.ndarray
-        mask: cnp.ndarray
+        Downsample the total counts to `percent` of the current total (a fraction in (0, 1]), returning a new PETrackII object.
+        Counts are shuffled and sampled per chromosome; the returned track is sorted.
+        """
+        assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
+        chrnames = sorted(self.get_chr_names())
+        
+        # Set up shuffling like PETrackI: a seeded MT19937 RandomState if seed >= 0, else NumPy's global shuffle
+        if seed >= 0:
+            info(f"#   A random seed {seed} has been used")
+            rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
+            rs_shuffle = rs.shuffle
+        else:
+            rs_shuffle = np.random.shuffle
 
         ret = PETrackII(anno=self.annotation, buffer_size=self.buffer_size)
         ret.barcode_dict = dict(self.barcode_dict)
         ret.barcode_last_n = self.barcode_last_n
 
-        chrnames = self.get_chr_names()
-        n_total = 0
-        chr_totals = {}
-        for k in chrnames:
-            chr_totals[k] = self.locations[k]['c'].sum()
-            n_total += chr_totals[k]
-
-        if n_total == 0 or samplesize == 0:
-            for k in chrnames:
-                ret.locations[k] = self.locations[k][:0]
-                ret.barcodes[k] = self.barcodes[k][:0]
-                ret.size[k] = 0
-                ret.buf_size[k] = 0
-            ret.length = 0
-            ret.total = 0
-            ret.average_template_length = 0.0
-            return ret
-
-        if seed >= 0:
-            info(f"# A random seed {seed} has been used in the sampling function")
-            rs = np.random.default_rng(seed)
-        else:
-            rs = np.random.default_rng()
+        ret.length = 0
+        ret.total = 0
 
-        for k in sorted(chrnames):
+        for k in chrnames:
             loc = self.locations[k]
             bar = self.barcodes[k]
-            n_chr = chr_totals[k]
-            if n_chr == 0:
-                ret.locations[k] = loc[:0]
-                ret.barcodes[k] = bar[:0]
-                ret.size[k] = 0
-                ret.buf_size[k] = 0
-                continue
-            chr_target = int(round(samplesize * n_chr / n_total))
-            if chr_target == 0:
+            counts = loc['c']
+            n = int(counts.sum())
+            n_sample = int(round(n * percent))
+            if n == 0 or n_sample == 0:
                 ret.locations[k] = loc[:0]
                 ret.barcodes[k] = bar[:0]
                 ret.size[k] = 0
                 ret.buf_size[k] = 0
                 continue
-            counts = loc['c']
-            probs = counts / counts.sum()
-            new_counts = rs.multinomial(chr_target, probs)
-            mask = new_counts > 0
-            ret.locations[k] = loc[mask].copy()
-            ret.locations[k]['c'] = new_counts[mask]
-            ret.barcodes[k] = bar[mask].copy()
-            ret.size[k] = ret.locations[k].shape[0]
-            ret.buf_size[k] = ret.size[k]
-            ret.length += np.sum((ret.locations[k]['r'] - ret.locations[k]['l']) * ret.locations[k]['c'])
-            ret.total += np.sum(ret.locations[k]['c'])
+
+            idx_flat = np.repeat(np.arange(len(loc)), counts)
+            rs_shuffle(idx_flat)
+            idx_flat = idx_flat[:n_sample]
+            unique_idx, new_counts = np.unique(idx_flat, return_counts=True)
+            new_locs = loc[unique_idx].copy()
+            new_locs['c'] = new_counts
+            new_bars = bar[unique_idx].copy()
+            ret.locations[k] = new_locs
+            ret.barcodes[k] = new_bars
+            ret.size[k] = len(new_locs)
+            ret.buf_size[k] = len(new_locs)
+            ret.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c'])
+            ret.total += np.sum(new_locs['c'])
+
         if ret.total > 0:
             ret.average_template_length = float(ret.length) / ret.total
         else:
             ret.average_template_length = 0.0
         ret.set_rlengths(self.get_rlengths())
-        ret.is_sorted = False
+        ret.sort()
         return ret
 
     @cython.ccall
-    def sample_percent(self,
-                       percent: cython.float,
-                       seed: cython.int = -1):
-        """Downsample total counts to a specified percent of the
-        current total (in-place).
-
-        E.g., percent=0.5 keeps about half of all fragment counts in the object.
-
+    def sample_num(self,
+                   samplesize: cython.ulong,
+                   seed: cython.int = -1):
         """
-        samplesize: cython.ulong
-        assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
-        current_total = 0
-        for k in self.get_chr_names():
-            current_total += self.locations[k]['c'].sum()
-        samplesize = cython.cast(cython.ulong, round(current_total * percent, 5))
-        self.sample_num(samplesize, seed)
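+        Downsample in-place so that the sum of all counts is about samplesize (rounded per chromosome, capped at the current total).
+        NOTE(review): samplesize=0 or an empty track gives percent=0.0, which sample_percent's assert rejects."""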
+        chrnames = self.get_chr_names()
+        n_total = 0
+        chr_totals = {}
+        for k in chrnames:
+            chr_totals[k] = self.locations[k]['c'].sum()
+            n_total += chr_totals[k]
+        percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0)
+        self.sample_percent(percent, seed)
         return
 
     @cython.ccall
-    def sample_percent_copy(self,
-                            percent: cython.float,
-                            seed: cython.int = -1):
-        """Downsample total counts to a specified percent of the
-        current total (returns new object).
-
-        E.g., percent=0.5 returns a new PETrackII with half the total counts.
+    def sample_num_copy(self,
+                        samplesize: cython.ulong,
+                        seed: cython.int = -1):
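+        """Downsample to a new PETrackII whose counts sum to about samplesize (rounded per chromosome, capped at the current total).
+        NOTE(review): samplesize=0 or an empty track gives percent=0.0, which sample_percent_copy's assert rejects.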
 
         """
-        samplesize: cython.ulong
-        assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
-        current_total = 0
-        for k in self.get_chr_names():
-            current_total += self.locations[k]['c'].sum()
-        samplesize = cython.cast(cython.ulong, round(current_total * percent, 5))
-        return self.sample_num_copy(samplesize, seed)
+        chrnames = self.get_chr_names()
+        n_total = 0
+        chr_totals = {}
+        for k in chrnames:
+            chr_totals[k] = self.locations[k]['c'].sum()
+            n_total += chr_totals[k]
+        percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0)
+        return self.sample_percent_copy(percent, seed)
 
     @cython.ccall
     def pileup_bdg_hmmr(self,