Skip to content

Commit c79fa3c

Browse files
committed
fix the way to subsample fragments; upload sorted frag test file
1 parent 8819679 commit c79fa3c

2 files changed

Lines changed: 98 additions & 210 deletions

File tree

MACS3/Signal/PairedEndTrack.py

Lines changed: 98 additions & 210 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# cython: language_level=3
22
# cython: profile=True
3-
# Time-stamp: <2025-07-24 10:17:48 Tao Liu>
3+
# Time-stamp: <2025-07-24 11:37:58 Tao Liu>
44

55
"""Module for filter duplicate tags from paired-end data
66
@@ -15,8 +15,7 @@
1515
import io
1616
import sys
1717
from array import array as pyarray
18-
from collections import Counter
19-
18+
from collections import Counter,defaultdict
2019
# ------------------------------------
2120
# MACS3 modules
2221
# ------------------------------------
@@ -1292,265 +1291,154 @@ def exclude(self, regions):
12921291
return
12931292

12941293
@cython.ccall
1295-
def sample_frag_percent_copy(self,
1296-
percent: cython.float,
1297-
seed: cython.int = -1):
1298-
"""Sample the fragments for a given percentage, return a new
1299-
PETrackII object.
1300-
1301-
Sampling is performed per chromosome, preserving the barcode
1302-
associations.
1303-
1294+
def sample_percent(self, percent: cython.float, seed: cython.int = -1):
13041295
"""
1305-
num: cython.uint
1306-
k: bytes
1307-
chrnames: set
1308-
indices: cnp.ndarray
1309-
n_frags: cython.int
1310-
1311-
ret_petrackII = PETrackII(anno=self.annotation,
1312-
buffer_size=self.buffer_size)
1313-
# Copy barcode_dict and barcode_last_n, as barcodes remain valid
1314-
ret_petrackII.barcode_dict = dict(self.barcode_dict)
1315-
ret_petrackII.barcode_last_n = self.barcode_last_n
1316-
1317-
chrnames = self.get_chr_names()
1296+
Downsample all counts to a specified percent, in-place.
1297+
Shuffle and sample per chromosome.
1298+
"""
1299+
assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
1300+
chrnames = sorted(self.get_chr_names())
13181301

1302+
# Setup shuffling logic like PETrackI
13191303
if seed >= 0:
1320-
info(f"# A random seed {seed} has been used in the sampling function")
1321-
rs = np.random.default_rng(seed)
1322-
else:
1323-
rs = np.random.default_rng()
1324-
rs_shuffle = rs.shuffle
1325-
1326-
for k in sorted(chrnames):
1327-
loc = self.locations[k]
1328-
bar = self.barcodes[k]
1329-
n_frags = loc.shape[0]
1330-
if n_frags == 0:
1331-
continue
1332-
num = cython.cast(cython.uint, round(n_frags * percent, 5))
1333-
indices = np.arange(n_frags)
1334-
rs_shuffle(indices)
1335-
indices = indices[:num]
1336-
indices.sort()
1337-
ret_petrackII.locations[k] = np.copy(loc[indices])
1338-
ret_petrackII.barcodes[k] = np.copy(bar[indices])
1339-
ret_petrackII.size[k] = ret_petrackII.locations[k].shape[0]
1340-
ret_petrackII.buf_size[k] = ret_petrackII.size[k]
1341-
ret_petrackII.length += np.sum((ret_petrackII.locations[k]['r'] - ret_petrackII.locations[k]['l']) * ret_petrackII.locations[k]['c'])
1342-
ret_petrackII.total += np.sum(ret_petrackII.locations[k]['c'])
1343-
1344-
if ret_petrackII.total > 0:
1345-
ret_petrackII.average_template_length = cython.cast(cython.float, ret_petrackII.length) / ret_petrackII.total
1304+
info(f"# A random seed {seed} has been used")
1305+
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
1306+
rs_shuffle = rs.shuffle
13461307
else:
1347-
ret_petrackII.average_template_length = 0.0
1348-
1349-
ret_petrackII.set_rlengths(self.get_rlengths())
1350-
ret_petrackII.is_sorted = False # Shuffling breaks sorting!
1351-
return ret_petrackII
1352-
1353-
@cython.ccall
1354-
def sample_num(self,
1355-
samplesize: cython.ulong,
1356-
seed: cython.int = -1):
1357-
"""Downsample the object in-place so that the sum of all
1358-
counts is samplesize.
1359-
1360-
Sampling is performed proportionally to each fragment's count.
1361-
1362-
"""
1363-
k: bytes
1364-
chrnames: set
1365-
n_total: cython.ulong
1366-
probs: cnp.ndarray
1367-
new_counts: cnp.ndarray
1368-
mask: cnp.ndarray
1308+
rs_shuffle = np.random.shuffle
13691309

13701310
self.length = 0
13711311
self.total = 0
13721312
self.average_template_length = 0.0
13731313

1374-
chrnames = self.get_chr_names()
1375-
# Compute the current total counts
1376-
n_total = 0
1377-
chr_totals = {}
13781314
for k in chrnames:
1379-
chr_totals[k] = self.locations[k]['c'].sum()
1380-
n_total += chr_totals[k]
1381-
1382-
if n_total == 0 or samplesize == 0:
1383-
# Just clear everything
1384-
for k in chrnames:
1385-
self.locations[k] = self.locations[k][:0]
1386-
self.barcodes[k] = self.barcodes[k][:0]
1387-
self.size[k] = 0
1388-
self.length = 0
1389-
self.total = 0
1390-
self.average_template_length = 0.0
1391-
return
1392-
1393-
if seed >= 0:
1394-
info(f"# A random seed {seed} has been used in the sampling function")
1395-
rs = np.random.default_rng(seed)
1396-
else:
1397-
rs = np.random.default_rng()
1398-
1399-
for k in sorted(chrnames):
14001315
loc = self.locations[k]
14011316
bar = self.barcodes[k]
1402-
n_chr = chr_totals[k]
1403-
if n_chr == 0:
1404-
# No data in this chromosome
1405-
self.locations[k] = loc[:0]
1406-
self.barcodes[k] = bar[:0]
1407-
self.size[k] = 0
1408-
continue
1409-
# Number of counts to sample for this chromosome (proportional)
1410-
chr_target = int(round(samplesize * n_chr / n_total))
1411-
if chr_target == 0:
1317+
counts = loc['c']
1318+
n = int(counts.sum())
1319+
n_sample = int(round(n * percent))
1320+
if n == 0 or n_sample == 0:
14121321
self.locations[k] = loc[:0]
14131322
self.barcodes[k] = bar[:0]
14141323
self.size[k] = 0
14151324
continue
14161325

1417-
counts = loc['c']
1418-
probs = counts / counts.sum()
1419-
# Sample how many counts each fragment keeps (multinomial)
1420-
new_counts = rs.multinomial(chr_target, probs)
1421-
# Mask: keep fragments with >0 counts
1422-
mask = new_counts > 0
1423-
self.locations[k] = loc[mask].copy()
1424-
self.locations[k]['c'] = new_counts[mask]
1425-
self.barcodes[k] = bar[mask].copy()
1426-
self.size[k] = self.locations[k].shape[0]
1427-
self.length += np.sum((self.locations[k]['r'] - self.locations[k]['l']) * self.locations[k]['c'])
1428-
self.total += np.sum(self.locations[k]['c'])
1326+
# Flatten: build an array of indices into loc, repeated by count
1327+
idx_flat = np.repeat(np.arange(len(loc)), counts)
1328+
rs_shuffle(idx_flat)
1329+
idx_flat = idx_flat[:n_sample]
1330+
1331+
# Recount: count how many times each index is chosen
1332+
unique_idx, new_counts = np.unique(idx_flat, return_counts=True)
1333+
# Compose new arrays
1334+
new_locs = loc[unique_idx].copy()
1335+
new_locs['c'] = new_counts
1336+
new_bars = bar[unique_idx].copy()
1337+
self.locations[k] = new_locs
1338+
self.barcodes[k] = new_bars
1339+
self.size[k] = len(new_locs)
1340+
self.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c'])
1341+
self.total += np.sum(new_locs['c'])
1342+
14291343
if self.total > 0:
14301344
self.average_template_length = float(self.length) / self.total
14311345
else:
14321346
self.average_template_length = 0.0
1433-
self.is_sorted = False # shuffling breaks sort
1347+
self.sort()
14341348
return
14351349

14361350
@cython.ccall
1437-
def sample_num_copy(self,
1438-
samplesize: cython.ulong,
1439-
seed: cython.int = -1):
1440-
"""Downsample to a new PETrackII so that the sum of all counts
1441-
is samplesize.
1442-
1443-
Sampling is performed proportionally to each fragment's count.
1444-
1351+
def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1):
14451352
"""
1446-
import numpy as np
1447-
k: bytes
1448-
chrnames: set
1449-
n_total: cython.ulong
1450-
probs: cnp.ndarray
1451-
new_counts: cnp.ndarray
1452-
mask: cnp.ndarray
1353+
Downsample all counts to a specified percent, returning a new PETrackII object.
1354+
Shuffle and sample per chromosome.
1355+
"""
1356+
assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
1357+
chrnames = sorted(self.get_chr_names())
1358+
1359+
# Setup shuffling logic like PETrackI
1360+
if seed >= 0:
1361+
info(f"# A random seed {seed} has been used")
1362+
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
1363+
rs_shuffle = rs.shuffle
1364+
else:
1365+
rs_shuffle = np.random.shuffle
14531366

14541367
ret = PETrackII(anno=self.annotation, buffer_size=self.buffer_size)
14551368
ret.barcode_dict = dict(self.barcode_dict)
14561369
ret.barcode_last_n = self.barcode_last_n
14571370

1458-
chrnames = self.get_chr_names()
1459-
n_total = 0
1460-
chr_totals = {}
1461-
for k in chrnames:
1462-
chr_totals[k] = self.locations[k]['c'].sum()
1463-
n_total += chr_totals[k]
1464-
1465-
if n_total == 0 or samplesize == 0:
1466-
for k in chrnames:
1467-
ret.locations[k] = self.locations[k][:0]
1468-
ret.barcodes[k] = self.barcodes[k][:0]
1469-
ret.size[k] = 0
1470-
ret.buf_size[k] = 0
1471-
ret.length = 0
1472-
ret.total = 0
1473-
ret.average_template_length = 0.0
1474-
return ret
1475-
1476-
if seed >= 0:
1477-
info(f"# A random seed {seed} has been used in the sampling function")
1478-
rs = np.random.default_rng(seed)
1479-
else:
1480-
rs = np.random.default_rng()
1371+
ret.length = 0
1372+
ret.total = 0
14811373

1482-
for k in sorted(chrnames):
1374+
for k in chrnames:
14831375
loc = self.locations[k]
14841376
bar = self.barcodes[k]
1485-
n_chr = chr_totals[k]
1486-
if n_chr == 0:
1487-
ret.locations[k] = loc[:0]
1488-
ret.barcodes[k] = bar[:0]
1489-
ret.size[k] = 0
1490-
ret.buf_size[k] = 0
1491-
continue
1492-
chr_target = int(round(samplesize * n_chr / n_total))
1493-
if chr_target == 0:
1377+
counts = loc['c']
1378+
n = int(counts.sum())
1379+
n_sample = int(round(n * percent))
1380+
if n == 0 or n_sample == 0:
14941381
ret.locations[k] = loc[:0]
14951382
ret.barcodes[k] = bar[:0]
14961383
ret.size[k] = 0
14971384
ret.buf_size[k] = 0
14981385
continue
1499-
counts = loc['c']
1500-
probs = counts / counts.sum()
1501-
new_counts = rs.multinomial(chr_target, probs)
1502-
mask = new_counts > 0
1503-
ret.locations[k] = loc[mask].copy()
1504-
ret.locations[k]['c'] = new_counts[mask]
1505-
ret.barcodes[k] = bar[mask].copy()
1506-
ret.size[k] = ret.locations[k].shape[0]
1507-
ret.buf_size[k] = ret.size[k]
1508-
ret.length += np.sum((ret.locations[k]['r'] - ret.locations[k]['l']) * ret.locations[k]['c'])
1509-
ret.total += np.sum(ret.locations[k]['c'])
1386+
1387+
idx_flat = np.repeat(np.arange(len(loc)), counts)
1388+
rs_shuffle(idx_flat)
1389+
idx_flat = idx_flat[:n_sample]
1390+
unique_idx, new_counts = np.unique(idx_flat, return_counts=True)
1391+
new_locs = loc[unique_idx].copy()
1392+
new_locs['c'] = new_counts
1393+
new_bars = bar[unique_idx].copy()
1394+
ret.locations[k] = new_locs
1395+
ret.barcodes[k] = new_bars
1396+
ret.size[k] = len(new_locs)
1397+
ret.buf_size[k] = len(new_locs)
1398+
ret.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c'])
1399+
ret.total += np.sum(new_locs['c'])
1400+
15101401
if ret.total > 0:
15111402
ret.average_template_length = float(ret.length) / ret.total
15121403
else:
15131404
ret.average_template_length = 0.0
15141405
ret.set_rlengths(self.get_rlengths())
1515-
ret.is_sorted = False
1406+
ret.sort()
15161407
return ret
15171408

15181409
@cython.ccall
1519-
def sample_percent(self,
1520-
percent: cython.float,
1521-
seed: cython.int = -1):
1522-
"""Downsample total counts to a specified percent of the
1523-
current total (in-place).
1524-
1525-
E.g., percent=0.5 keeps about half of all fragment counts in the object.
1526-
1410+
def sample_num(self,
1411+
samplesize: cython.ulong,
1412+
seed: cython.int = -1):
15271413
"""
1528-
samplesize: cython.ulong
1529-
assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
1530-
current_total = 0
1531-
for k in self.get_chr_names():
1532-
current_total += self.locations[k]['c'].sum()
1533-
samplesize = cython.cast(cython.ulong, round(current_total * percent, 5))
1534-
self.sample_num(samplesize, seed)
1414+
Downsample in-place so that the sum of all counts is samplesize.
1415+
"""
1416+
chrnames = self.get_chr_names()
1417+
n_total = 0
1418+
chr_totals = {}
1419+
for k in chrnames:
1420+
chr_totals[k] = self.locations[k]['c'].sum()
1421+
n_total += chr_totals[k]
1422+
percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0)
1423+
self.sample_percent(percent, seed)
15351424
return
15361425

15371426
@cython.ccall
1538-
def sample_percent_copy(self,
1539-
percent: cython.float,
1540-
seed: cython.int = -1):
1541-
"""Downsample total counts to a specified percent of the
1542-
current total (returns new object).
1543-
1544-
E.g., percent=0.5 returns a new PETrackII with half the total counts.
1427+
def sample_num_copy(self,
1428+
samplesize: cython.ulong,
1429+
seed: cython.int = -1):
1430+
"""Downsample to a new PETrackII so that the sum of all
1431+
counts is samplesize.
15451432
15461433
"""
1547-
samplesize: cython.ulong
1548-
assert 0.0 < percent <= 1.0, "percent must be in (0, 1]"
1549-
current_total = 0
1550-
for k in self.get_chr_names():
1551-
current_total += self.locations[k]['c'].sum()
1552-
samplesize = cython.cast(cython.ulong, round(current_total * percent, 5))
1553-
return self.sample_num_copy(samplesize, seed)
1434+
chrnames = self.get_chr_names()
1435+
n_total = 0
1436+
chr_totals = {}
1437+
for k in chrnames:
1438+
chr_totals[k] = self.locations[k]['c'].sum()
1439+
n_total += chr_totals[k]
1440+
percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0)
1441+
return self.sample_percent_copy(percent, seed)
15541442

15551443
@cython.ccall
15561444
def pileup_bdg_hmmr(self,
-587 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)