Merge pull request #902 from nf-core/endorspy-resume-fix

jfy133 · web-flow · commit 4e93c5518d06 · 2022-07-29T13:04:54.000+02:00
Endorspy resume fix
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 - [#882](https://github.com/nf-core/eager/pull/882) Define DSL1 execution explicitly, as new versions of Nextflow made DSL2 the default (♥ to & fix from @Lehmann-Fabian)
 - [#879](https://github.com/nf-core/eager/issues/879) Add missing threads parameter for pre-clipping FastQC for single end data that caused insufficient memory in some cases (♥ to @marcel-keller for reporting)
+- [#880](https://github.com/nf-core/eager/issues/880) Fix failure of endorSpy to be cached or reexecuted on resume (♥ to @KathrinNaegele, @TCLamnidis, & @mahesh-panchal for reporting and debugging)
 - [#885](https://github.com/nf-core/eager/issues/885) Specify task memory for all tools in get_software_versions to account for incompatibility of Java with some SGE clusters causing hanging of the process (♥ to @maxibor for reporting)
 - [#887](https://github.com/nf-core/eager/issues/887) Clarify what is considered 'ultra-short' reads in the help text of clip_readlength, for when you may wish to turn off length filtering during AdapterRemoval (♥ to @TCLamnidis for reporting)
 - [#889](https://github.com/nf-core/eager/issues/889) Removed/updated parameters from benchmarking test profiles (♥ to @TCLamnidis for reporting)
@@ -19,6 +20,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - [#899](https://github.com/nf-core/eager/issues/897) Fix pipeline crash for circulargenerator if reference file does not end in .fasta (♥ to @scarlhoff for reporting)
 - Fixed some missing default values in the nextflow parameter schema JSON
 - [#789](https://github.com/nf-core/eager/issues/789) Substantial speed and memory optimisation of the `extract_map_reads.py` script (♥ to @ivelsko for reporting, @maxibor for optimisation)
+- Fix staging of input BAMs for the genotyping_pileupcaller process, a downstream consequence of the changes introduced when fixing endorSpy caching.
 
 ### `Dependencies`
 
diff --git a/main.nf b/main.nf
@@ -769,7 +769,7 @@ ch_input_for_fastp.fourcol
       [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
 
     }
- .set { ch_skipfastp_for_merge }
+  .set { ch_skipfastp_for_merge }
 
 ch_output_from_fastp
   .map{
@@ -800,7 +800,7 @@ process adapter_removal {
 
     input:
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_fastp_for_adapterremoval
-    path adapterlist from ch_adapterlist.collect().dump(tag: "Adapter list")
+    path adapterlist from ch_adapterlist.collect()
 
     output:
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1
@@ -968,13 +968,10 @@ if ( params.skip_collapse ){
 // AdapterRemoval bypass when not running it
 if (!params.skip_adapterremoval) {
     ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
-        .dump(tag: "post_ar_adapterremoval_decision_skipar")
         .filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
-        .dump(tag: "ar_bypass")
         .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } 
 } else {
     ch_fastp_for_skipadapterremoval
-        .dump(tag: "post_ar_adapterremoval_decision_withar")
         .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } 
 }
 
@@ -1076,7 +1073,6 @@ ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
       [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
 
   }
-  .dump(tag: "lanemerge_bypass_decision")
   .branch {
     skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
     merge_me: it[7].size() > 1
@@ -1097,7 +1093,6 @@ ch_branched_for_lanemerge_skipme = ch_branched_for_lanemerge.skip_merge
 
         [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
   }
-  .dump(tag: "lanemerge_reconfigure")
 
 
 ch_branched_for_lanemerge_ready = ch_branched_for_lanemerge.merge_me
@@ -1125,15 +1120,15 @@ process lanemerge {
   publishDir "${params.outdir}/lanemerging", mode: params.publish_dir_mode
 
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready.dump(tag: "lange_merge_input")
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready
 
   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_lanemerged.fq.gz") into ch_lanemerge_for_mapping_r1
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R2_lanemerged.fq.gz") optional true into ch_lanemerge_for_mapping_r2
 
   script:
   if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){
-  lane = 0
+  def lane = 0
   """
   cat ${r1} > "${libraryid}"_R1_lanemerged.fq.gz
   cat ${r2} > "${libraryid}"_R2_lanemerged.fq.gz
@@ -1149,7 +1144,6 @@ process lanemerge {
 // Ensuring always valid R2 file even if doesn't exist for AWS
 if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
   ch_lanemerge_for_mapping_r1
-    .dump(tag: "post_lanemerge_reconfigure")
     .mix(ch_lanemerge_for_mapping_r2)
     .groupTuple(by: [0,1,2,3,4,5,6])
     .map{
@@ -1264,8 +1258,8 @@ process bwa {
     publishDir "${params.outdir}/mapping/bwa", mode: params.publish_dir_mode
 
     input:
-    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "bwa_input_reads")
-    path index from bwa_index.collect().dump(tag: "input_index")
+    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa
+    path index from bwa_index.collect()
 
     output:
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.mapped.bam"), path("*.{bai,csi}") into ch_output_from_bwa   
@@ -1564,17 +1558,21 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging
     it ->
       def samplename = it[0]
       def libraryid  = it[1]
-      def lane = it[2]
+      def lane = 0
       def seqtype = it[3].unique() // How to deal with this?
       def organism = it[4]
       def strandedness = it[5]
       def udg = it[6]
       def r1 = it[7]
       def r2 = it[8]
 
-      // We will assume if mixing it is better to set as PE as this is informative
+      // 1. We will assume if mixing it is better to set as PE as this is informative
       // for DeDup (and markduplicates doesn't care), but will throw a warning!
-      def seqtype_new = seqtype.flatten().size() > 1 ? 'PE' : seqtype 
+      // 2. We will also flatten to a single value to address problems with 'unstable' 
+      // Nextflow ArrayBag object types not allowing the .join to work between resumes
+      // See: https://github.com/nf-core/eager/issues/880
+
+      def seqtype_new = seqtype.flatten().size() > 1 ? 'PE' : seqtype.flatten()[0] 
                       
       if ( seqtype.flatten().size() > 1 &&  params.dedupper == 'dedup' ) {
         log.warn "[nf-core/eager] Warning: you are running DeDup on BAMs with a mixture of PE/SE data for library: ${libraryid}. DeDup is designed for PE data only, deduplication maybe suboptimal!"
@@ -1583,7 +1581,6 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging
       [ samplename, libraryid, lane, seqtype_new, organism, strandedness, udg, r1, r2 ]
 
   }
-  .dump(tag: "pre_seqtype_decision")
   .branch {
     skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
     merge_me: it[7].size() > 1
@@ -1791,11 +1788,12 @@ if (params.run_bam_filtering) {
         def seqtype = it[3]
         def organism = it[4]
         def strandedness = it[5]
-        def udg = it[6]     
+        def udg = it[6]
         def stats = file(it[7])
         def poststats = file("$projectDir/assets/nf-core_eager_dummy.txt")
 
-      [samplename, libraryid, lane, seqtype, organism, strandedness, udg, stats, poststats ] }
+      [samplename, libraryid, lane, seqtype, organism, strandedness, udg, stats, poststats ] 
+    }
     .set{ ch_allflagstats_for_endorspy }
 }
 
@@ -1956,7 +1954,6 @@ ch_input_for_librarymerging.merge_me
 
       [it[0], libraryid, it[2], seqtype, it[4], it[5], it[6], bam, bai ]
     }
-  .dump(tag: "input_for_lib_merging")
   .set { ch_fixedinput_for_librarymerging }
 
 process library_merge {
@@ -1965,7 +1962,7 @@ process library_merge {
   publishDir "${params.outdir}/merged_bams/initial", mode: params.publish_dir_mode
 
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "library_merge_input")
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging
 
   output:
   tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, path("*_libmerged_rmdup.bam"), path("*_libmerged_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging
@@ -2233,7 +2230,7 @@ process bam_trim {
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_bamutils_decision.totrim
 
     output: 
-    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.trimmed.bam"), file("*.trimmed.bam.{bai,csi}") into ch_trimmed_from_bamutils
+    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.trimmed.bam"), path("*.trimmed.bam.{bai,csi}") into ch_trimmed_from_bamutils
 
     script:
     def softclip = params.bamutils_softclip ? '-c' : '' 
@@ -2265,7 +2262,7 @@ ch_trimmed_formerge = ch_bamutils_decision.notrim
         def seqtype = it[3]
         def organism = it[4]
         def strandedness = it[5]
-        def udg = it[6]     
+        def udg = it[6]
         def bam = it[7].flatten()
         def bai = it[8].flatten()
 
@@ -2491,10 +2488,36 @@ ch_damagemanipulation_for_genotyping_pileupcaller
  // Create pileupcaller input tuples
 ch_input_for_genotyping_pileupcaller.singleStranded
   .groupTuple(by:[5])
+  .map{
+        def samplename = it[0]
+        def libraryid  = it[1]
+        def lane = it[2]
+        def seqtype = it[3]
+        def organism = it[4]
+        def strandedness = it[5]
+        def udg = it[6]
+        def bam = it[7].flatten()
+        def bai = it[8].flatten()
+
+      [samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai ]
+  }
   .set {ch_prepped_for_pileupcaller_single}
 
 ch_input_for_genotyping_pileupcaller.doubleStranded
   .groupTuple(by:[5])
+  .map{
+        def samplename = it[0]
+        def libraryid  = it[1]
+        def lane = it[2]
+        def seqtype = it[3]
+        def organism = it[4]
+        def strandedness = it[5]
+        def udg = it[6]
+        def bam = it[7].flatten()
+        def bai = it[8].flatten()
+
+      [samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai ]
+  }
   .set {ch_prepped_for_pileupcaller_double}
 
 process genotyping_pileupcaller {
@@ -2506,12 +2529,12 @@ process genotyping_pileupcaller {
   params.run_genotyping && params.genotyping_tool == 'pileupcaller'
 
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai from ch_prepped_for_pileupcaller_double.mix(ch_prepped_for_pileupcaller_single)
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_prepped_for_pileupcaller_double.mix(ch_prepped_for_pileupcaller_single)
   file fasta from ch_fasta_for_genotyping_pileupcaller.collect()
   file fai from ch_fai_for_pileupcaller.collect()
   file dict from ch_dict_for_pileupcaller.collect()
   path(bed) from ch_bed_for_pileupcaller.collect()
-  path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "pileupcaller_snp_file")
+  path(snp) from ch_snp_for_pileupcaller.collect()
 
   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
@@ -2542,7 +2565,7 @@ process eigenstrat_snp_coverage {
   params.run_genotyping && params.genotyping_tool == 'pileupcaller'
   
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input')
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage
   
   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
@@ -2673,7 +2696,7 @@ process vcf2genome {
 if (!params.additional_vcf_files) {
     ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.collect()
 } else {
-    ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.mix(ch_extravcfs_for_multivcfanalyzer).collect().dump(tag: "postmix")
+    ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.mix(ch_extravcfs_for_multivcfanalyzer).collect()
 }
 
 process multivcfanalyzer {
@@ -3342,7 +3365,6 @@ workflow.onError {
 def extract_data(tsvFile) {
     Channel.fromPath(tsvFile)
         .splitCsv(header: true, sep: '\t')
-        .dump(tag:'tsv_extract')
         .map { row ->
 
             def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM']