Commit 80f263f

Merge pull request #604 from nf-core/adapterremoval-prefix-fix: Adapterremoval prefix fix

Parents: 2232c2c + a706788

4 files changed: 22 additions & 18 deletions

Files changed:

- CHANGELOG.md
- docs/usage.md
- main.nf
- nextflow_schema.json

CHANGELOG.md (1 addition & 0 deletions)

```diff
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### `Fixed`
 
 - Fixed AWS full test profile.
+- [#587](https://github.com/nf-core/eager/issues/587) - Re-implemented AdapterRemovalFixPrefix for DeDup compatibility of including singletons
 
 ## [2.2.1] - 2020-10-20
 
```
docs/usage.md (8 additions & 11 deletions)

```diff
@@ -8,7 +8,9 @@
 
 <!-- TOC depthfrom:2 depthto:3 -->
 
+- [:warning: Please read this documentation on the nf-core website: https://nf-co.re/eager/usage](#warning-please-read-this-documentation-on-the-nf-core-website-httpsnf-coreeagerusage)
 - [Table of contents](#table-of-contents)
+- [Introduction](#introduction)
 - [Running the pipeline](#running-the-pipeline)
 - [Quick Start](#quick-start)
 - [Updating the pipeline](#updating-the-pipeline)
@@ -1351,21 +1353,16 @@ Picard. Alternatively an ancient DNA specific read deduplication tool `dedup`
 
 This utilises both ends of paired-end data to remove duplicates (i.e. true exact
 duplicates, as markduplicates will over-zealously deduplicate anything with the
-same starting position even if the ends are different). DeDup should only be
-used solely on paired-end data otherwise suboptimal deduplication can occur if
-applied to either single-end or a mix of single-end/paired-end data.
-
-Note that if you run without the `--mergedonly` flag for AdapterRemoval, DeDup
-will likely fail. If you absolutely want to use both PE and SE data, you can
-supply the `--dedup_all_merged` flag to consider singletons to also be merged
-paired-end reads. This may result in over-zealous deduplication.
+same starting position even if the ends are different). DeDup should generally
+only be used solely on paired-end data otherwise suboptimal deduplication can
+occur if applied to either single-end or a mix of single-end/paired-end data.
 
 #### `--dedup_all_merged`
 
 Sets DeDup to treat all reads as merged reads. This is useful if reads are for
-example not prefixed with `M_` in all cases. Therefore, this can be used as a
-workaround when also using a mixture of paired-end and single-end data, however
-this is not recommended (see above).
+example not prefixed with `M_`, `R_`, or `L_` in all cases. Therefore, this can
+be used as a workaround when also using a mixture of paired-end and single-end
+data, however this is not recommended (see above).
 
 > Modifies dedup parameter: `-m`
 
```
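The read-name prefix convention that this documentation change refers to can be sketched with invented read names. This is illustrative only, with the prefix semantics summarised from the docs above (the read names themselves are made up):

```shell
# Illustrative only: DeDup keys on read-name prefixes to tell merged and
# unmerged reads apart. M_ marks a merged pair; this PR prefixes unmerged
# mates with R_ / L_ so DeDup can handle them correctly.
for name in M_seq001 R_seq002 L_seq002; do
  case "$name" in
    M_*)     echo "$name -> merged read" ;;
    R_*|L_*) echo "$name -> unmerged mate" ;;
    *)       echo "$name -> unprefixed (would need --dedup_all_merged)" ;;
  esac
done
```

Without any prefix, reads fall into the last branch, which is the mixed-data situation `--dedup_all_merged` works around.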

main.nf (12 additions & 6 deletions)

```diff
@@ -1175,14 +1175,17 @@ process adapter_removal {
 
    #Combine files
    if [ ${preserve5p} = "--preserve5p" ] && [ ${mergedonly} = "N" ]; then
-     cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
    elif [ ${preserve5p} = "--preserve5p" ] && [ ${mergedonly} = "Y" ] ; then
-     cat *.collapsed.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz
    elif [ ${mergedonly} = "Y" ] ; then
-     cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
    else
-     cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
    fi
+
+   ## Add R_ and L_ for unmerged reads for DeDup compatibility
+   AdapterRemovalFixPrefix output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus} > output/${base}.pe.combined.fq.gz
 
    mv *.settings output/
    """
@@ -1200,11 +1203,14 @@ process adapter_removal {
    AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} ${collapse_me} ${trim_me}
 
    if [ ${mergedonly} = "Y" ]; then
-     cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
    else
-     cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.fq.gz
+     cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
    fi
 
+   ## Add R_ and L_ for unmerged reads for DeDup compatibility
+   AdapterRemovalFixPrefix output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus} > output/${base}.pe.combined.fq.gz
+
    mv *.settings output/
    """
 } else if ( seqtype != 'PE' ) {
```
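The combine-then-fix flow introduced above can be mimicked outside the pipeline. AdapterRemovalFixPrefix is a separate tool, so the awk command below is only a rough stand-in for its header rewriting; the file names and the single `R_` prefix are assumptions for illustration, not the tool's actual behaviour:

```shell
# Sketch of the fixed flow: write the combined reads to a .tmp file first,
# rewrite the headers, then emit the final file name the rest of the
# pipeline reads.
printf '@seq001/1\nACGT\n+\nIIII\n' > combined.tmp.fq

# Hypothetical stand-in for AdapterRemovalFixPrefix: prefix each FASTQ
# record header (every 4th line) with R_. The real tool distinguishes
# forward (R_) and reverse (L_) unmerged mates.
awk 'NR % 4 == 1 { sub(/^@/, "@R_") } { print }' combined.tmp.fq > combined.fq

head -1 combined.fq   # prints: @R_seq001/1
```

Writing to a `.tmp` name first is what lets the final `${base}.pe.combined.fq.gz` keep its original name for the downstream channels while still passing through the prefix fix.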

nextflow_schema.json (1 addition & 1 deletion)

```diff
@@ -694,7 +694,7 @@
       "default": "markduplicates",
       "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.",
       "fa_icon": "fas fa-object-group",
-      "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n\nNote that if you run without the `--mergedonly` flag for AdapterRemoval, DeDup will\nlikely fail. If you absolutely want to use both PE and SE data, you can supply the\n`--dedup_all_merged` flag to consider singletons to also be merged paired-end reads. This\nmay result in over-zealous deduplication.",
+      "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n",
       "enum": [
         "markduplicates",
         "dedup"
```
