|
1 | 1 | # cython: language_level=3 |
2 | 2 | # cython: profile=True |
3 | | -# Time-stamp: <2025-07-24 10:17:48 Tao Liu> |
| 3 | +# Time-stamp: <2025-07-24 11:37:58 Tao Liu> |
4 | 4 |
|
5 | 5 | """Module for filter duplicate tags from paired-end data |
6 | 6 |
|
|
15 | 15 | import io |
16 | 16 | import sys |
17 | 17 | from array import array as pyarray |
18 | | -from collections import Counter |
19 | | - |
| 18 | +from collections import Counter,defaultdict |
20 | 19 | # ------------------------------------ |
21 | 20 | # MACS3 modules |
22 | 21 | # ------------------------------------ |
@@ -1292,265 +1291,154 @@ def exclude(self, regions): |
1292 | 1291 | return |
1293 | 1292 |
|
1294 | 1293 | @cython.ccall |
1295 | | - def sample_frag_percent_copy(self, |
1296 | | - percent: cython.float, |
1297 | | - seed: cython.int = -1): |
1298 | | - """Sample the fragments for a given percentage, return a new |
1299 | | - PETrackII object. |
1300 | | -
|
1301 | | - Sampling is performed per chromosome, preserving the barcode |
1302 | | - associations. |
1303 | | -
|
| 1294 | + def sample_percent(self, percent: cython.float, seed: cython.int = -1): |
1304 | 1295 | """ |
1305 | | - num: cython.uint |
1306 | | - k: bytes |
1307 | | - chrnames: set |
1308 | | - indices: cnp.ndarray |
1309 | | - n_frags: cython.int |
1310 | | - |
1311 | | - ret_petrackII = PETrackII(anno=self.annotation, |
1312 | | - buffer_size=self.buffer_size) |
1313 | | - # Copy barcode_dict and barcode_last_n, as barcodes remain valid |
1314 | | - ret_petrackII.barcode_dict = dict(self.barcode_dict) |
1315 | | - ret_petrackII.barcode_last_n = self.barcode_last_n |
1316 | | - |
1317 | | - chrnames = self.get_chr_names() |
| 1296 | + Downsample all counts to a specified percent, in-place. |
| 1297 | + Shuffle and sample per chromosome. |
| 1298 | + """ |
| 1299 | + assert 0.0 < percent <= 1.0, "percent must be in (0, 1]" |
| 1300 | + chrnames = sorted(self.get_chr_names()) |
1318 | 1301 |
|
| 1302 | + # Setup shuffling logic like PETrackI |
1319 | 1303 | if seed >= 0: |
1320 | | - info(f"# A random seed {seed} has been used in the sampling function") |
1321 | | - rs = np.random.default_rng(seed) |
1322 | | - else: |
1323 | | - rs = np.random.default_rng() |
1324 | | - rs_shuffle = rs.shuffle |
1325 | | - |
1326 | | - for k in sorted(chrnames): |
1327 | | - loc = self.locations[k] |
1328 | | - bar = self.barcodes[k] |
1329 | | - n_frags = loc.shape[0] |
1330 | | - if n_frags == 0: |
1331 | | - continue |
1332 | | - num = cython.cast(cython.uint, round(n_frags * percent, 5)) |
1333 | | - indices = np.arange(n_frags) |
1334 | | - rs_shuffle(indices) |
1335 | | - indices = indices[:num] |
1336 | | - indices.sort() |
1337 | | - ret_petrackII.locations[k] = np.copy(loc[indices]) |
1338 | | - ret_petrackII.barcodes[k] = np.copy(bar[indices]) |
1339 | | - ret_petrackII.size[k] = ret_petrackII.locations[k].shape[0] |
1340 | | - ret_petrackII.buf_size[k] = ret_petrackII.size[k] |
1341 | | - ret_petrackII.length += np.sum((ret_petrackII.locations[k]['r'] - ret_petrackII.locations[k]['l']) * ret_petrackII.locations[k]['c']) |
1342 | | - ret_petrackII.total += np.sum(ret_petrackII.locations[k]['c']) |
1343 | | - |
1344 | | - if ret_petrackII.total > 0: |
1345 | | - ret_petrackII.average_template_length = cython.cast(cython.float, ret_petrackII.length) / ret_petrackII.total |
| 1304 | + info(f"# A random seed {seed} has been used") |
| 1305 | + rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed))) |
| 1306 | + rs_shuffle = rs.shuffle |
1346 | 1307 | else: |
1347 | | - ret_petrackII.average_template_length = 0.0 |
1348 | | - |
1349 | | - ret_petrackII.set_rlengths(self.get_rlengths()) |
1350 | | - ret_petrackII.is_sorted = False # Shuffling breaks sorting! |
1351 | | - return ret_petrackII |
1352 | | - |
1353 | | - @cython.ccall |
1354 | | - def sample_num(self, |
1355 | | - samplesize: cython.ulong, |
1356 | | - seed: cython.int = -1): |
1357 | | - """Downsample the object in-place so that the sum of all |
1358 | | - counts is samplesize. |
1359 | | -
|
1360 | | - Sampling is performed proportionally to each fragment's count. |
1361 | | -
|
1362 | | - """ |
1363 | | - k: bytes |
1364 | | - chrnames: set |
1365 | | - n_total: cython.ulong |
1366 | | - probs: cnp.ndarray |
1367 | | - new_counts: cnp.ndarray |
1368 | | - mask: cnp.ndarray |
| 1308 | + rs_shuffle = np.random.shuffle |
1369 | 1309 |
|
1370 | 1310 | self.length = 0 |
1371 | 1311 | self.total = 0 |
1372 | 1312 | self.average_template_length = 0.0 |
1373 | 1313 |
|
1374 | | - chrnames = self.get_chr_names() |
1375 | | - # Compute the current total counts |
1376 | | - n_total = 0 |
1377 | | - chr_totals = {} |
1378 | 1314 | for k in chrnames: |
1379 | | - chr_totals[k] = self.locations[k]['c'].sum() |
1380 | | - n_total += chr_totals[k] |
1381 | | - |
1382 | | - if n_total == 0 or samplesize == 0: |
1383 | | - # Just clear everything |
1384 | | - for k in chrnames: |
1385 | | - self.locations[k] = self.locations[k][:0] |
1386 | | - self.barcodes[k] = self.barcodes[k][:0] |
1387 | | - self.size[k] = 0 |
1388 | | - self.length = 0 |
1389 | | - self.total = 0 |
1390 | | - self.average_template_length = 0.0 |
1391 | | - return |
1392 | | - |
1393 | | - if seed >= 0: |
1394 | | - info(f"# A random seed {seed} has been used in the sampling function") |
1395 | | - rs = np.random.default_rng(seed) |
1396 | | - else: |
1397 | | - rs = np.random.default_rng() |
1398 | | - |
1399 | | - for k in sorted(chrnames): |
1400 | 1315 | loc = self.locations[k] |
1401 | 1316 | bar = self.barcodes[k] |
1402 | | - n_chr = chr_totals[k] |
1403 | | - if n_chr == 0: |
1404 | | - # No data in this chromosome |
1405 | | - self.locations[k] = loc[:0] |
1406 | | - self.barcodes[k] = bar[:0] |
1407 | | - self.size[k] = 0 |
1408 | | - continue |
1409 | | - # Number of counts to sample for this chromosome (proportional) |
1410 | | - chr_target = int(round(samplesize * n_chr / n_total)) |
1411 | | - if chr_target == 0: |
| 1317 | + counts = loc['c'] |
| 1318 | + n = int(counts.sum()) |
| 1319 | + n_sample = int(round(n * percent)) |
| 1320 | + if n == 0 or n_sample == 0: |
1412 | 1321 | self.locations[k] = loc[:0] |
1413 | 1322 | self.barcodes[k] = bar[:0] |
1414 | 1323 | self.size[k] = 0 |
1415 | 1324 | continue |
1416 | 1325 |
|
1417 | | - counts = loc['c'] |
1418 | | - probs = counts / counts.sum() |
1419 | | - # Sample how many counts each fragment keeps (multinomial) |
1420 | | - new_counts = rs.multinomial(chr_target, probs) |
1421 | | - # Mask: keep fragments with >0 counts |
1422 | | - mask = new_counts > 0 |
1423 | | - self.locations[k] = loc[mask].copy() |
1424 | | - self.locations[k]['c'] = new_counts[mask] |
1425 | | - self.barcodes[k] = bar[mask].copy() |
1426 | | - self.size[k] = self.locations[k].shape[0] |
1427 | | - self.length += np.sum((self.locations[k]['r'] - self.locations[k]['l']) * self.locations[k]['c']) |
1428 | | - self.total += np.sum(self.locations[k]['c']) |
| 1326 | + # Flatten: build an array of indices into loc, repeated by count |
| 1327 | + idx_flat = np.repeat(np.arange(len(loc)), counts) |
| 1328 | + rs_shuffle(idx_flat) |
| 1329 | + idx_flat = idx_flat[:n_sample] |
| 1330 | + |
| 1331 | + # Recount: count how many times each index is chosen |
| 1332 | + unique_idx, new_counts = np.unique(idx_flat, return_counts=True) |
| 1333 | + # Compose new arrays |
| 1334 | + new_locs = loc[unique_idx].copy() |
| 1335 | + new_locs['c'] = new_counts |
| 1336 | + new_bars = bar[unique_idx].copy() |
| 1337 | + self.locations[k] = new_locs |
| 1338 | + self.barcodes[k] = new_bars |
| 1339 | + self.size[k] = len(new_locs) |
| 1340 | + self.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c']) |
| 1341 | + self.total += np.sum(new_locs['c']) |
| 1342 | + |
1429 | 1343 | if self.total > 0: |
1430 | 1344 | self.average_template_length = float(self.length) / self.total |
1431 | 1345 | else: |
1432 | 1346 | self.average_template_length = 0.0 |
1433 | | - self.is_sorted = False # shuffling breaks sort |
| 1347 | + self.sort() |
1434 | 1348 | return |
1435 | 1349 |
|
1436 | 1350 | @cython.ccall |
1437 | | - def sample_num_copy(self, |
1438 | | - samplesize: cython.ulong, |
1439 | | - seed: cython.int = -1): |
1440 | | - """Downsample to a new PETrackII so that the sum of all counts |
1441 | | - is samplesize. |
1442 | | -
|
1443 | | - Sampling is performed proportionally to each fragment's count. |
1444 | | -
|
| 1351 | + def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): |
1445 | 1352 | """ |
1446 | | - import numpy as np |
1447 | | - k: bytes |
1448 | | - chrnames: set |
1449 | | - n_total: cython.ulong |
1450 | | - probs: cnp.ndarray |
1451 | | - new_counts: cnp.ndarray |
1452 | | - mask: cnp.ndarray |
| 1353 | + Downsample all counts to a specified percent, returning a new PETrackII object. |
| 1354 | + Shuffle and sample per chromosome. |
| 1355 | + """ |
| 1356 | + assert 0.0 < percent <= 1.0, "percent must be in (0, 1]" |
| 1357 | + chrnames = sorted(self.get_chr_names()) |
| 1358 | + |
| 1359 | + # Setup shuffling logic like PETrackI |
| 1360 | + if seed >= 0: |
| 1361 | + info(f"# A random seed {seed} has been used") |
| 1362 | + rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed))) |
| 1363 | + rs_shuffle = rs.shuffle |
| 1364 | + else: |
| 1365 | + rs_shuffle = np.random.shuffle |
1453 | 1366 |
|
1454 | 1367 | ret = PETrackII(anno=self.annotation, buffer_size=self.buffer_size) |
1455 | 1368 | ret.barcode_dict = dict(self.barcode_dict) |
1456 | 1369 | ret.barcode_last_n = self.barcode_last_n |
1457 | 1370 |
|
1458 | | - chrnames = self.get_chr_names() |
1459 | | - n_total = 0 |
1460 | | - chr_totals = {} |
1461 | | - for k in chrnames: |
1462 | | - chr_totals[k] = self.locations[k]['c'].sum() |
1463 | | - n_total += chr_totals[k] |
1464 | | - |
1465 | | - if n_total == 0 or samplesize == 0: |
1466 | | - for k in chrnames: |
1467 | | - ret.locations[k] = self.locations[k][:0] |
1468 | | - ret.barcodes[k] = self.barcodes[k][:0] |
1469 | | - ret.size[k] = 0 |
1470 | | - ret.buf_size[k] = 0 |
1471 | | - ret.length = 0 |
1472 | | - ret.total = 0 |
1473 | | - ret.average_template_length = 0.0 |
1474 | | - return ret |
1475 | | - |
1476 | | - if seed >= 0: |
1477 | | - info(f"# A random seed {seed} has been used in the sampling function") |
1478 | | - rs = np.random.default_rng(seed) |
1479 | | - else: |
1480 | | - rs = np.random.default_rng() |
| 1371 | + ret.length = 0 |
| 1372 | + ret.total = 0 |
1481 | 1373 |
|
1482 | | - for k in sorted(chrnames): |
| 1374 | + for k in chrnames: |
1483 | 1375 | loc = self.locations[k] |
1484 | 1376 | bar = self.barcodes[k] |
1485 | | - n_chr = chr_totals[k] |
1486 | | - if n_chr == 0: |
1487 | | - ret.locations[k] = loc[:0] |
1488 | | - ret.barcodes[k] = bar[:0] |
1489 | | - ret.size[k] = 0 |
1490 | | - ret.buf_size[k] = 0 |
1491 | | - continue |
1492 | | - chr_target = int(round(samplesize * n_chr / n_total)) |
1493 | | - if chr_target == 0: |
| 1377 | + counts = loc['c'] |
| 1378 | + n = int(counts.sum()) |
| 1379 | + n_sample = int(round(n * percent)) |
| 1380 | + if n == 0 or n_sample == 0: |
1494 | 1381 | ret.locations[k] = loc[:0] |
1495 | 1382 | ret.barcodes[k] = bar[:0] |
1496 | 1383 | ret.size[k] = 0 |
1497 | 1384 | ret.buf_size[k] = 0 |
1498 | 1385 | continue |
1499 | | - counts = loc['c'] |
1500 | | - probs = counts / counts.sum() |
1501 | | - new_counts = rs.multinomial(chr_target, probs) |
1502 | | - mask = new_counts > 0 |
1503 | | - ret.locations[k] = loc[mask].copy() |
1504 | | - ret.locations[k]['c'] = new_counts[mask] |
1505 | | - ret.barcodes[k] = bar[mask].copy() |
1506 | | - ret.size[k] = ret.locations[k].shape[0] |
1507 | | - ret.buf_size[k] = ret.size[k] |
1508 | | - ret.length += np.sum((ret.locations[k]['r'] - ret.locations[k]['l']) * ret.locations[k]['c']) |
1509 | | - ret.total += np.sum(ret.locations[k]['c']) |
| 1386 | + |
| 1387 | + idx_flat = np.repeat(np.arange(len(loc)), counts) |
| 1388 | + rs_shuffle(idx_flat) |
| 1389 | + idx_flat = idx_flat[:n_sample] |
| 1390 | + unique_idx, new_counts = np.unique(idx_flat, return_counts=True) |
| 1391 | + new_locs = loc[unique_idx].copy() |
| 1392 | + new_locs['c'] = new_counts |
| 1393 | + new_bars = bar[unique_idx].copy() |
| 1394 | + ret.locations[k] = new_locs |
| 1395 | + ret.barcodes[k] = new_bars |
| 1396 | + ret.size[k] = len(new_locs) |
| 1397 | + ret.buf_size[k] = len(new_locs) |
| 1398 | + ret.length += np.sum((new_locs['r'] - new_locs['l']) * new_locs['c']) |
| 1399 | + ret.total += np.sum(new_locs['c']) |
| 1400 | + |
1510 | 1401 | if ret.total > 0: |
1511 | 1402 | ret.average_template_length = float(ret.length) / ret.total |
1512 | 1403 | else: |
1513 | 1404 | ret.average_template_length = 0.0 |
1514 | 1405 | ret.set_rlengths(self.get_rlengths()) |
1515 | | - ret.is_sorted = False |
| 1406 | + ret.sort() |
1516 | 1407 | return ret |
1517 | 1408 |
|
1518 | 1409 | @cython.ccall |
1519 | | - def sample_percent(self, |
1520 | | - percent: cython.float, |
1521 | | - seed: cython.int = -1): |
1522 | | - """Downsample total counts to a specified percent of the |
1523 | | - current total (in-place). |
1524 | | -
|
1525 | | - E.g., percent=0.5 keeps about half of all fragment counts in the object. |
1526 | | -
|
| 1410 | + def sample_num(self, |
| 1411 | + samplesize: cython.ulong, |
| 1412 | + seed: cython.int = -1): |
1527 | 1413 | """ |
1528 | | - samplesize: cython.ulong |
1529 | | - assert 0.0 < percent <= 1.0, "percent must be in (0, 1]" |
1530 | | - current_total = 0 |
1531 | | - for k in self.get_chr_names(): |
1532 | | - current_total += self.locations[k]['c'].sum() |
1533 | | - samplesize = cython.cast(cython.ulong, round(current_total * percent, 5)) |
1534 | | - self.sample_num(samplesize, seed) |
| 1414 | + Downsample in-place so that the sum of all counts is samplesize. |
| 1415 | + """ |
| 1416 | + chrnames = self.get_chr_names() |
| 1417 | + n_total = 0 |
| 1418 | + chr_totals = {} |
| 1419 | + for k in chrnames: |
| 1420 | + chr_totals[k] = self.locations[k]['c'].sum() |
| 1421 | + n_total += chr_totals[k] |
| 1422 | + percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0) |
| 1423 | + self.sample_percent(percent, seed) |
1535 | 1424 | return |
1536 | 1425 |
|
1537 | 1426 | @cython.ccall |
1538 | | - def sample_percent_copy(self, |
1539 | | - percent: cython.float, |
1540 | | - seed: cython.int = -1): |
1541 | | - """Downsample total counts to a specified percent of the |
1542 | | - current total (returns new object). |
1543 | | -
|
1544 | | - E.g., percent=0.5 returns a new PETrackII with half the total counts. |
| 1427 | + def sample_num_copy(self, |
| 1428 | + samplesize: cython.ulong, |
| 1429 | + seed: cython.int = -1): |
| 1430 | + """Downsample to a new PETrackII so that the sum of all |
| 1431 | + counts is samplesize. |
1545 | 1432 |
|
1546 | 1433 | """ |
1547 | | - samplesize: cython.ulong |
1548 | | - assert 0.0 < percent <= 1.0, "percent must be in (0, 1]" |
1549 | | - current_total = 0 |
1550 | | - for k in self.get_chr_names(): |
1551 | | - current_total += self.locations[k]['c'].sum() |
1552 | | - samplesize = cython.cast(cython.ulong, round(current_total * percent, 5)) |
1553 | | - return self.sample_num_copy(samplesize, seed) |
| 1434 | + chrnames = self.get_chr_names() |
| 1435 | + n_total = 0 |
| 1436 | + chr_totals = {} |
| 1437 | + for k in chrnames: |
| 1438 | + chr_totals[k] = self.locations[k]['c'].sum() |
| 1439 | + n_total += chr_totals[k] |
| 1440 | + percent = 0.0 if n_total == 0 else min(samplesize / n_total, 1.0) |
| 1441 | + return self.sample_percent_copy(percent, seed) |
1554 | 1442 |
|
1555 | 1443 | @cython.ccall |
1556 | 1444 | def pileup_bdg_hmmr(self, |
|
0 commit comments