Skip to content

Commit 9ad0b55

Browse files
authored
Merge branch 'dev' into mva
2 parents 8f77163 + 97d53bc commit 9ad0b55

5 files changed

Lines changed: 93 additions & 74 deletions

File tree

conf/test_microbial.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ params {
4444

4545
// BAM filtering
4646
deduplication_tool = "dedup"
47+
deduplication_skipregionsplit = true
4748
run_bamfiltering = true
4849
bamfiltering_minreadlength = 30
4950
bamfiltering_mappingquality = 37

nextflow.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ params {
199199

200200
// Deduplication options
201201
skip_deduplication = false
202+
deduplication_skipregionsplit = false
202203
deduplication_tool = 'markduplicates'
203204

204205
// Qualimap

nextflow_schema.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,12 @@
10001000
"description": "Specify to skip the removal of PCR duplicates.",
10011001
"fa_icon": "fas fa-forward"
10021002
},
1003+
"deduplication_skipregionsplit": {
1004+
"type": "boolean",
1005+
"description": "Specify to run deduplicaiton without splitting bams by contig (default behavior).",
1006+
"fa_icon": "fas fa-forward",
1007+
"help_text": "Run deduplication steps bam-by-bam rather than contig-by-contig for each bam file. This reduces the total number of jobs submitted to a cluster, but increases the computational runtime. If you use a shared cluster with limited resources, running many low-resource jobs can slow down the overall runtime of eager due to scheduling constraints.\nAlso applicable for poor-quality reference genomes."
1008+
},
10031009
"deduplication_tool": {
10041010
"type": "string",
10051011
"default": "markduplicates",

subworkflows/local/deduplicate.nf

Lines changed: 83 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -29,39 +29,49 @@ workflow DEDUPLICATE {
2929
addNewMetaFromAttributes( it, "id" , "reference" , false )
3030
}
3131

32-
// Create genomic regions file for splitting the bam before deduplication
33-
BUILD_INTERVALS( fasta_fai )
34-
ch_versions = ch_versions.mix( BUILD_INTERVALS.out.versions.first() )
32+
if ( params.deduplication_skipregionsplit ) {
3533

36-
// Prep regions for combining
37-
ch_intervals_for_join = BUILD_INTERVALS.out.bed
38-
.map {
39-
// Replace meta with new meta that contains the meta.id value in the meta.reference attribute only
40-
addNewMetaFromAttributes( it, "id" , "reference" , true )
41-
}
34+
// No splitting of .bam files by contig, deduplicate all in one
35+
input_for_deduplication = ch_bam_bai
4236

43-
// Ensure input bam matches the regions file
44-
ch_bam_for_split = ch_bam_bai
45-
.map {
46-
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
47-
addNewMetaFromAttributes( it, "reference" , "reference" , false )
48-
}
49-
.combine(
50-
by: 0,
51-
ch_intervals_for_join
52-
)
37+
} else {
38+
39+
// Create genomic regions file for splitting the bam before deduplication
40+
BUILD_INTERVALS( fasta_fai )
41+
ch_versions = ch_versions.mix( BUILD_INTERVALS.out.versions.first() )
42+
43+
// Prep regions for combining
44+
ch_intervals_for_join = BUILD_INTERVALS.out.bed
5345
.map {
54-
ignore_me, meta, bam, bai, regions ->
55-
[ meta, bam, bai, regions ]
46+
// Replace meta with new meta that contains the meta.id value in the meta.reference attribute only
47+
addNewMetaFromAttributes( it, "id" , "reference" , true )
5648
}
5749

58-
//Split input bam by region
59-
BAM_SPLIT_BY_REGION( ch_bam_for_split )
60-
ch_versions = ch_versions.mix( BAM_SPLIT_BY_REGION.out.versions )
50+
// Ensure input bam matches the regions file
51+
ch_bam_for_split = ch_bam_bai
52+
.map {
53+
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
54+
addNewMetaFromAttributes( it, "reference" , "reference" , false )
55+
}
56+
.combine(
57+
by: 0,
58+
ch_intervals_for_join
59+
)
60+
.map {
61+
ignore_me, meta, bam, bai, regions ->
62+
[ meta, bam, bai, regions ]
63+
}
64+
65+
// Split input bam by region
66+
BAM_SPLIT_BY_REGION( ch_bam_for_split )
67+
input_for_deduplication = BAM_SPLIT_BY_REGION.out.bam_bai
68+
ch_versions = ch_versions.mix( BAM_SPLIT_BY_REGION.out.versions )
69+
70+
}
6171

6272
if ( params.deduplication_tool == 'markduplicates' ) {
6373

64-
ch_markduplicates_input = BAM_SPLIT_BY_REGION.out.bam_bai
74+
ch_markduplicates_input = input_for_deduplication
6575
.map {
6676
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
6777
addNewMetaFromAttributes( it, "reference" , "reference" , false )
@@ -83,70 +93,83 @@ workflow DEDUPLICATE {
8393
ch_markduplicates_input.fasta,
8494
ch_markduplicates_input.fasta_fai
8595
)
86-
ch_versions = ch_versions.mix( PICARD_MARKDUPLICATES.out.versions.first() )
96+
ch_versions = ch_versions.mix( PICARD_MARKDUPLICATES.out.versions.first() )
8797

88-
ch_dedupped_region_bam = PICARD_MARKDUPLICATES.out.bam
98+
ch_dedupped_bam = PICARD_MARKDUPLICATES.out.bam
8999

90100
} else if ( params.deduplication_tool == "dedup" ) {
91-
ch_dedup_input = BAM_SPLIT_BY_REGION.out.bam_bai
101+
ch_dedup_input = input_for_deduplication
92102
.map {
93103
meta, bam, bai ->
94104
[ meta, bam ]
95105
}
96106

97107
DEDUP( ch_dedup_input )
98-
ch_versions = ch_versions.mix( DEDUP.out.versions.first() )
108+
ch_versions = ch_versions.mix( DEDUP.out.versions.first() )
99109

100-
ch_dedupped_region_bam = DEDUP.out.bam
110+
ch_dedupped_bam = DEDUP.out.bam
101111
}
102112

103-
ch_input_for_samtools_merge = ch_dedupped_region_bam
104-
.map {
105-
meta, bam ->
106-
meta2 = meta.clone().findAll{ it.key != 'genomic_region' }
107-
[ meta2, bam ]
108-
}
109-
.groupTuple()
110-
.map {
111-
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
112-
addNewMetaFromAttributes( it, "reference" , "reference" , false )
113-
}
114-
.combine(
115-
by:0,
116-
ch_refs
113+
if ( params.deduplication_skipregionsplit ) {
114+
115+
// Bams were never split by region, so bypass the re-merging step
116+
ch_input_for_samtools_sort_dedupped = ch_dedupped_bam
117+
118+
} else {
119+
120+
// Re-merging of bams-by-contig must take place after deduplication
121+
ch_input_for_samtools_merge = ch_dedupped_bam
122+
.map {
123+
meta, bam ->
124+
meta2 = meta.clone().findAll{ it.key != 'genomic_region' }
125+
[ meta2, bam ]
126+
}
127+
.groupTuple()
128+
.map {
129+
// Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute
130+
addNewMetaFromAttributes( it, "reference" , "reference" , false )
131+
}
132+
.combine(
133+
by:0,
134+
ch_refs
135+
)
136+
.multiMap{
137+
// bam here is a list of bams
138+
ignore_me, meta, bam, meta2, fasta, fasta_fai ->
139+
bam: [ meta, bam ]
140+
fasta: [ meta2, fasta ]
141+
fasta_fai: [ meta2, fasta_fai ]
142+
}
143+
144+
// Merge the bams for each region into one bam
145+
SAMTOOLS_MERGE_DEDUPPED(
146+
ch_input_for_samtools_merge.bam,
147+
ch_input_for_samtools_merge.fasta,
148+
ch_input_for_samtools_merge.fasta_fai
117149
)
118-
.multiMap{
119-
// bam here is a list of bams
120-
ignore_me, meta, bam, meta2, fasta, fasta_fai ->
121-
bam: [ meta, bam ]
122-
fasta: [ meta2, fasta ]
123-
fasta_fai: [ meta2, fasta_fai ]
124-
}
150+
ch_versions = ch_versions.mix( SAMTOOLS_MERGE_DEDUPPED.out.versions )
125151

126-
// Merge the bams for each region into one bam
127-
SAMTOOLS_MERGE_DEDUPPED(
128-
ch_input_for_samtools_merge.bam,
129-
ch_input_for_samtools_merge.fasta,
130-
ch_input_for_samtools_merge.fasta_fai
131-
)
132-
ch_versions = ch_versions.mix( SAMTOOLS_MERGE_DEDUPPED.out.versions )
152+
ch_input_for_samtools_sort_dedupped = SAMTOOLS_MERGE_DEDUPPED.out.bam
153+
154+
}
133155

134156

135157
// Sort the merged bam and index
136-
SAMTOOLS_SORT_DEDUPPED ( SAMTOOLS_MERGE_DEDUPPED.out.bam )
158+
SAMTOOLS_SORT_DEDUPPED ( ch_input_for_samtools_sort_dedupped )
137159
ch_versions = ch_versions.mix( SAMTOOLS_SORT_DEDUPPED.out.versions )
138160
ch_dedup_bam = SAMTOOLS_SORT_DEDUPPED.out.bam
139161

140162
SAMTOOLS_INDEX_DEDUPPED ( ch_dedup_bam )
141163
ch_versions = ch_versions.mix( SAMTOOLS_INDEX_DEDUPPED.out.versions )
142-
ch_dedup_bai = params.fasta_largeref ? SAMTOOLS_INDEX_DEDUPPED.out.csi : SAMTOOLS_INDEX_DEDUPPED.out.bai
164+
ch_dedup_bai = params.fasta_largeref ? SAMTOOLS_INDEX_DEDUPPED.out.csi : SAMTOOLS_INDEX_DEDUPPED.out.bai
143165

144166
// Finally run flagstat on the dedupped bam
145167
ch_input_for_samtools_flagstat = ch_dedup_bam.join( ch_dedup_bai )
146168

147169
SAMTOOLS_FLAGSTAT_DEDUPPED(
148170
ch_input_for_samtools_flagstat
149171
)
172+
150173
ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_DEDUPPED.out.versions )
151174
ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_DEDUPPED.out.flagstat )
152175
ch_dedup_flagstat = SAMTOOLS_FLAGSTAT_DEDUPPED.out.flagstat

tests/test_microbial.nf.test.snap

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -636,9 +636,6 @@
636636
"BEDTOOLS_COVERAGE_DEPTH": {
637637
"bedtools": "2.31.1)"
638638
},
639-
"BUILD_INTERVALS": {
640-
"gawk": "5.1.0"
641-
},
642639
"BWA_ALN": {
643640
"bwa": "0.7.18-r1243-dirty"
644641
},
@@ -697,9 +694,6 @@
697694
"SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES": {
698695
"samtools": 1.18
699696
},
700-
"SAMTOOLS_INDEX": {
701-
"samtools": 1.18
702-
},
703697
"SAMTOOLS_INDEX_DEDUPPED": {
704698
"samtools": 1.18
705699
},
@@ -712,9 +706,6 @@
712706
"SAMTOOLS_LENGTH_FILTER_INDEX": {
713707
"samtools": 1.18
714708
},
715-
"SAMTOOLS_MERGE_DEDUPPED": {
716-
"samtools": 1.18
717-
},
718709
"SAMTOOLS_MERGE_LIBRARIES": {
719710
"samtools": 1.18
720711
},
@@ -724,9 +715,6 @@
724715
"SAMTOOLS_SORT_MERGED_LIBRARIES": {
725716
"samtools": 1.18
726717
},
727-
"SAMTOOLS_VIEW": {
728-
"samtools": 1.18
729-
},
730718
"SAMTOOLS_VIEW_BAM_FILTERING": {
731719
"samtools": 1.18
732720
},
@@ -740,9 +728,9 @@
740728
],
741729
"meta": {
742730
"nf-test": "0.9.3",
743-
"nextflow": "25.04.8"
731+
"nextflow": "25.10.3"
744732
},
745-
"timestamp": "2025-11-07T11:08:47.368306"
733+
"timestamp": "2026-02-13T09:52:11.74559929"
746734
},
747735
"authentication": {
748736
"content": [

0 commit comments

Comments
 (0)