Merge branch 'dev' into master

jfy133 · web-flow · commit 9fdf42658adf · 2019-05-24T21:12:03.000+02:00
diff --git a/.travis.yml b/.travis.yml
@@ -42,8 +42,10 @@ script:
   - nf-core lint ${TRAVIS_BUILD_DIR}
   # Run the basic pipeline with the test profile
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --saveReference
+  # Test using PMD tools
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --run_pmdtools --pairedEnd
   # Run the basic pipeline with single end data (pretending its single end actually)
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/bwa_index/
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --singleEnd --bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta
   # Run the basic pipeline with paired end data without collapsing
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_collapse --saveReference
   # Run the basic pipeline with paired end data without trimming
@@ -53,14 +55,19 @@ script:
    # Run the basic pipeline with output unmapped reads as fastq
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --strip_input_fastq
   # Run the same pipeline testing optional step: fastp, complexity 
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta
   # Test BAM Trimming
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --trim_bam --bwa_index results/reference_genome/bwa_index/bwa_index/
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --trim_bam --bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta
   # Test running with CircularMapper
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --circularmapper --circulartarget 'NC_007596.2'
   # Test running with BWA Mem
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --bwamem --bwa_index results/reference_genome/bwa_index/bwa_index/
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --bwamem --bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta
   # Test with zipped reference input
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --fasta 'https://raw.githubusercontent.com/nf-core/test-datasets/eager2/reference/Test.fasta.gz'
   # Run the basic pipeline with the bam input profile
   - nextflow run ${TRAVIS_BUILD_DIR} -profile testbam,docker --bam
+  # Run the basic pipeline with FastA reference with `fna` extension
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test_fna,docker --pairedEnd --saveReference
+  # Test using pre-computed indices from a separate run beforehand
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test_fna,docker --pairedEnd --bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fna --fasta_index results/reference_genome/fasta_index/Mammoth_MT_Krause.fna.fai --seq_dict results/reference_genome/seq_dict/Mammoth_MT_Krause.dict 
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,12 +12,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * [#186](https://github.com/nf-core/eager/pull/186) - Make FastQC skipping [possible]
 /(https://github.com/nf-core/eager/issues/182)
 * Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes  
+* A lot more automated tests using Travis CI
+* Don't ignore DamageProfiler errors anymore 
 
 ### `Fixed`
 * [#152](https://github.com/nf-core/eager/pull/152) - DamageProfiler errors [won't crash entire pipeline anymore](https://github.com/nf-core/eager/issues/171)
 * [#176](https://github.com/nf-core/eager/pull/176) - Increase runtime for DamageProfiler on [large reference genomes](https://github.com/nf-core/eager/issues/173)
 * [#172](https://github.com/nf-core/eager/pull/152) - DamageProfiler errors [won't crash entire pipeline anymore](https://github.com/nf-core/eager/issues/171)
-* [#174](https://github.com/nf-core/eager/pull/190) - Publish DeDup files [properly](https://github.com/nf-core/eager/issues/183) 
+* [#174](https://github.com/nf-core/eager/pull/190) - Publish DeDup files [properly](https://github.com/nf-core/eager/issues/183)
+* [#196](https://github.com/nf-core/eager/pull/196) - Fix reference [issues](https://github.com/nf-core/eager/issues/150)
+* [#196](https://github.com/nf-core/eager/pull/196) - Fix issues with PE data being mapped incompletely
+* [#200](https://github.com/nf-core/eager/pull/200) - Fix minor issue with some [typos](https://github.com/nf-core/eager/pull/196)
+* [#210](https://github.com/nf-core/eager/pull/210) - Fix PMDTools [encoding issue](https://github.com/pontussk/PMDtools/issues/6) from `samtools calmd` generated files by running through `samtools view` first
+
 
 ### `Dependencies`
 
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -6,18 +6,17 @@ top_modules:
      - 'fastqc':
         name: 'FastQC (pre-AdapterRemoval)'
         path_filters:
-            - '*_fastqc.zip'
-        path_filters_exclude:
-             - '*.combined.prefixed_fastqc.zip'
+            - '*_raw_fastqc.zip'
      - 'fastp'
      - 'adapterRemoval'
      - 'fastqc':
          name: 'FastQC (post-AdapterRemoval)'
          path_filters:
+             - '*.truncated_fastqc.zip'
              - '*.combined*_fastqc.zip'
      - 'samtools'
-     - 'preseq'
      - 'dedup'
+     - 'preseq'
      - 'qualimap'
      - 'damageprofiler'
      - 'gatk'
diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py
@@ -13,16 +13,14 @@ def _get_args():
     parser = argparse.ArgumentParser(
         prog='extract_mapped_reads',
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        description=f'''
-Remove mapped in bam file from fastq files
-        ''')
+        description="Remove mapped in bam file from fastq files")
     parser.add_argument('bam_file', help="path to bam file")
     parser.add_argument('fwd', help='path to forward fastq file')
     parser.add_argument(
-        '-2',
+        '-rev',
         dest="rev",
         default=None,
-        help="path to forward fastq file")
+        help="path to reverse fastq file")
     parser.add_argument(
         '-of',
         dest="out_fwd",
@@ -89,6 +87,19 @@ def extract_mapped(bam, processes):
         chrs = bamfile.references
     except ValueError as e:
         print(e)
+
+    # Return an empty list if no reads mapped (because no reference matches in the BAM)
+    if len(chrs) == 0:
+        return([])
+
+    # Checking that nb_process is not > nb_chromosomes
+    elif len(chrs) < processes:
+        print(
+            f"""Requested {processes} processe(s), 
+            but can only be parallelized on {len(chrs)} 
+            processes with these data""")
+        processes = len(chrs)
+
     extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam)
     p = multiprocessing.Pool(processes)
     res = p.map(extract_mapped_chr_partial, chrs)
@@ -163,8 +174,8 @@ def write_fq(fq_dict, fname, mode):
     """
     Write to fastq file
     INPUT:
-    - fq_dict(dict) dictionary with unmapped read names as keys, seq and quality as values
-        in a list
+    - fq_dict(dict) dictionary with unmapped read names as keys, 
+        unmapped/mapped (u|m), seq, and quality as values in a list
     - fname(string) Path to output fastq file
     - mode(string) strip (remove read) or replace (replace read sequence) by Ns
     """
@@ -218,13 +229,14 @@ def write_fq(fq_dict, fname, mode):
                             f.write(f"{i}\n")
 
 
+def check_strip_mode(mode):  # Print a warning when mode (case-insensitive) is not a supported value
+    if mode.lower() not in ['replace', 'strip']:
+        print("Mode must be replace or strip")  # list the allowed modes, not the characters of the bad input
+
+
 if __name__ == "__main__":
     BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args()
 
-    if IN_REV and not OUT_REV:
-        print('You specified an input reverse fastq, but no output reverse fastq')
-        sys.exit(1)
-
     if OUT_FWD == None:
         out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz"
     else:
diff --git a/conf/base.config b/conf/base.config
@@ -61,13 +61,10 @@ process {
   withName: multiqc {
     errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' }
   }
-
   withName: damageprofiler {
-    errorStrategy = 'ignore'
-    params.large_ref ? "time = { check_max(8.h * task.attempt, 'time') }" : "time = { check_max(2.h * task.attempt, 'time') }"
+    time = params.large_ref ? { check_max(8.h * task.attempt, 'time') } : { check_max(2.h * task.attempt, 'time')}
   }
-
-  withName: extract_unmapped_reads {
+  withName: strip_input_fastq {
     cpus = { check_max(8 * task.attempt, 'cpus') }
     memory = { check_max( 8.GB * task.attempt, 'memory' ) }
   }
diff --git a/conf/test_fna.config b/conf/test_fna.config
@@ -0,0 +1,25 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/eager -profile test_fna,docker (or singularity, or conda)
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+  genome = "Custom"
+  //Input data
+  singleEnd = false
+  readPaths = [['JK2782_TGGCCGATCAACGA_L008', ['https://github.com/nf-core/test-datasets/raw/eager2/testdata/Mammoth/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager2/testdata/Mammoth/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz']],
+  ['JK2785_TGGCCGATCAACGA_L008', ['https://github.com/nf-core/test-datasets/raw/eager2/testdata/Mammoth/JK2785_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz','https://github.com/nf-core/test-datasets/raw/eager2/testdata/Mammoth/JK2785_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz']],
+  ]
+  // Genome references
+  fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager2/reference/Mammoth_MT_Krause.fna'
+}
diff --git a/docs/usage.md b/docs/usage.md
@@ -5,7 +5,7 @@
 <!-- Install Atom plugin markdown-toc-auto for this ToC to auto-update on save -->
 <!-- TOC START min:2 max:3 link:true asterisk:true update:true -->
 * [Table of contents](#table-of-contents)
-* [Introduction](#introduction)
+* [Introduction](#general-nextflow-info)
 * [Running the pipeline](#running-the-pipeline)
 * [Updating the pipeline](#updating-the-pipeline)
 * [Reproducibility](#reproducibility)
@@ -168,12 +168,14 @@ A normal glob pattern, enclosed in quotation marks, can then be used for `--read
 ```
 
 ### `--fasta`
-If you prefer, you can specify the full path to your reference genome when you run the pipeline:
+You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference file, which will be unzipped automatically for you.
+
+For example:
 
 ```bash
---fasta '[path to Fasta reference]'
+--fasta '/<path>/<to>/my_reference.fasta'
 ```
-> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note, that saving these for later has to be turned on using `--saveReference`. You may also specify the path to a gzipped (`*.gz` file extension) FastA as reference genome - this will be uncompressed by the pipeline automatically for you. Note that other file extensions such as `.fna`, `.fa` are also supported but will be renamed to `.fasta` automatically by the pipeline.
+> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters (see [below](#optional-reference-options)), the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--saveReference` flag.
 
 ### `--large_ref`
 
@@ -214,23 +216,55 @@ params {
 }
 ```
 
-### Optional Reference Utility Files
+## Optional Reference Options
+
+### Generating Fresh Indices
+
+#### `--saveReference`
+
+Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices EAGER2 generates will be stored in `<your_output_dir>/results/reference_genome` for you.
+
+### Premade Indices
+
+Supplying pre-made indices saves time in pipeline execution and is especially advised when running multiple times on the same cluster system, for example. You can even add a resource-specific profile that sets paths to pre-computed reference genomes, saving even more time when specifying these.
+
+#### `--bwa_index`
+
+If you want to use pre-existing `bwa index` indices, please supply the path **and file** to the FASTA you also specified in `--fasta` (see above). EAGER2 will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.
+
+For example:
+
+```
+nextflow run nf-core/eager \
+-profile test_fna,docker \
+--pairedEnd \
+--reads '*{R1,R2}*.fq.gz' \
+--fasta results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta \
+--bwa_index results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta
+```
+
+> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise EAGER2 will not be able to find them.
 
-### `--bwa_index`
+#### `--seq_dict`
 
-Use this to specify a _directory_ containing previously created BWA index files. This saves time in pipeline execution and is especially advised when running multiple times on the same cluster system for example. You can even add a resource specific profile that sets paths to pre-computed reference genomes, saving even time when specifying these.
+If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.
 
-### `--seq_dict` false
+For example:
 
-Use this to specify the required sequence dictionary file for the selected reference genome.
+```
+--seq_dict Mammoth_MT_Krause.dict
+```
+
+#### `--fasta_index`
 
-### `--fasta_index` false
+If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`.
 
-Use this to specify the required FastA index file for the selected reference genome.
+For example:
 
-### `--saveReference` false
+```
+--fasta_index Mammoth_MT_Krause.fasta.fai
+```
 
-If you turn this on, the generated indices will be stored in the `./results/reference_genomes` for you. 
 
 ## Other command line parameters
 
diff --git a/environment.yml b/environment.yml
@@ -17,7 +17,7 @@ dependencies:
   - bioconda::gatk4=4.1.1.0
   - bioconda::qualimap=2.2.2b
   - bioconda::vcf2genome=0.91
-  - bioconda::damageprofiler=0.4.5
+  - bioconda::damageprofiler=0.4.6
   - bioconda::multiqc=1.7
   - bioconda::pmdtools=0.60
   - conda-forge::r-rmarkdown=1.12
@@ -29,5 +29,5 @@ dependencies:
   - bioconda::bamutil=1.0.14
   - bioconda::mtnucratio=0.5
   - pysam=0.15.2
-  - python=3.6
+  - python=3.6.3
   #Missing Schmutzi,snpAD
diff --git a/main.nf b/main.nf
diff --git a/nextflow.config b/nextflow.config

Original file line number	Diff line number	Diff line change
`@@ -61,13 +61,10 @@ process {`
`61`	`61`	`withName: multiqc {`
`62`	`62`	`errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' }`
`63`	`63`	`}`
`64`		`-`
`65`	`64`	`withName: damageprofiler {`
`66`		`- errorStrategy = 'ignore'`
`67`		`- params.large_ref ? "time = { check_max(8.h * task.attempt, 'time') }" : "time = { check_max(2.h * task.attempt, 'time') }"`
	`65`	`+ time = params.large_ref ? { check_max(8.h * task.attempt, 'time') } : { check_max(2.h * task.attempt, 'time')}`
`68`	`66`	`}`
`69`		`-`
`70`		`- withName: extract_unmapped_reads {`
	`67`	`+ withName: strip_input_fastq {`
`71`	`68`	`cpus = { check_max(8 * task.attempt, 'cpus') }`
`72`	`69`	`memory = { check_max( 8.GB * task.attempt, 'memory' ) }`
`73`	`70`	`}`