Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,27 @@ jobs:
- name: Run ${{ matrix.profile }} test
run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker

aligner:
env:
NXF_ANSI_LOG: false
runs-on: ubuntu-latest
strategy:
matrix:
aligner: [bwa-mem, bwa-mem2]
steps:
- uses: actions/checkout@v2
- name: Install Nextflow
run: |
wget -qO- get.nextflow.io | bash
sudo mv nextflow /usr/local/bin/
env:
# Only check Nextflow pipeline minimum version
NXF_VER: '19.10.0'
- name: Pull docker image
run: docker pull nfcore/sarek:dev
- name: Run ${{ matrix.aligner }} test
run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }}

tools:
env:
NXF_ANSI_LOG: false
Expand Down
7 changes: 7 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

- [Preprocessing](#preprocessing)
- [Map to Reference](#map-to-reference)
- [bwa](#bwa)
- [BWA-mem2](#bwa-mem2)
- [Mark Duplicates](#mark-duplicates)
- [GATK MarkDuplicates](#gatk-markduplicates)
Expand Down Expand Up @@ -64,6 +65,12 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

### Map to Reference

#### bwa

[bwa](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome.

Such files are intermediate and not kept in the final files delivered to users.

#### BWA-mem2

[BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) is a software package for mapping low-divergent sequences against a large reference genome.
Expand Down
25 changes: 25 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
- [--save_trimmed](#--save_trimmed)
- [--split_fastq](#--split_fastq)
- [Preprocessing](#preprocessing)
- [--aligner](#--aligner)
- [--markdup_java_options](#--markdup_java_options)
- [--no_gatk_spark](#--no_gatk_spark)
- [--save_bam_mapped](#--save_bam_mapped)
Expand Down Expand Up @@ -908,6 +909,29 @@ For example:

## Preprocessing

### --aligner

Use this parameter to control which aligner is used for mapping the reads.

Available: `bwa-mem` and `bwa-mem2`

Default: `bwa-mem`

Example:

```bash
--aligner "bwa-mem"
```

> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2`.
> Use `--bwa=false` to have `Sarek` build them automatically.

Example:

```bash
--aligner "bwa-mem2" --bwa=false
```

### --markdup_java_options

To control the java options necessary for the `GATK MarkDuplicates` process, you can set this parameter.
Expand Down Expand Up @@ -1546,6 +1570,7 @@ Based on [nfcore/base:1.10.2](https://hub.docker.com/r/nfcore/base/tags), it con
- **[ASCAT](https://github.com/Crick-CancerGenomics/ascat)** 2.5.2
- **[AlleleCount](https://github.com/cancerit/alleleCount)** 4.0.2
- **[BCFTools](https://github.com/samtools/bcftools)** 1.9
- **[bwa](https://github.com/lh3/bwa)** 0.7.17
- **[bwa-mem2](https://github.com/bwa-mem2/bwa-mem2)** 2.0
- **[CNVkit](https://github.com/etal/cnvkit)** 0.9.6
- **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.5
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- bioconda::ascat=2.5.2
- bioconda::bcftools=1.9
- bioconda::bwa-mem2=2.0
- bioconda::bwa=0.7.17
- bioconda::cancerit-allelecount=4.0.2
- bioconda::cnvkit=0.9.6
- bioconda::control-freec=11.5
Expand Down
175 changes: 86 additions & 89 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -548,10 +548,11 @@ process get_software_versions {
when: !('versions' in skipQC)

script:
aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa"
"""
alleleCounter --version &> v_allelecount.txt 2>&1 || true
bcftools --version &> v_bcftools.txt 2>&1 || true
bwa-mem2 version &> v_bwa.txt 2>&1 || true
${aligner} version &> v_bwa.txt 2>&1 || true
cnvkit.py version &> v_cnvkit.txt 2>&1 || true
configManta.py --version &> v_manta.txt 2>&1 || true
configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true
Expand Down Expand Up @@ -602,8 +603,9 @@ process BuildBWAindexes {
when: !(params.bwa) && params.fasta && 'mapping' in step

script:
aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa"
"""
bwa-mem2 index ${fasta}
${aligner} index ${fasta}
"""
}

Expand Down Expand Up @@ -1015,61 +1017,58 @@ process TrimGalore {
// and while doing the conversion, tag the bam field RX with the UMI sequence

process UMIFastqToBAM {
publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

input:
set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsUMI
val read_structure1 from ch_read_structure1
val read_structure2 from ch_read_structure2

output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_converted.bam") into umi_converted_bams_ch
input:
set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsUMI
val read_structure1 from ch_read_structure1
val read_structure2 from ch_read_structure2

when: params.umi
output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_converted.bam") into umi_converted_bams_ch

// tmp folder for fgbio might be solved more elegantly?
when: params.umi

script:
"""
mkdir tmp
// tmp folder for fgbio might be solved more elegantly?

fgbio --tmp-dir=${PWD}/tmp \
FastqToBam \
-i "${idSample}_${idRun}_R1.fastq.gz" "${idSample}_${idRun}_R2.fastq.gz" \
-o "${idSample}_umi_converted.bam" \
--read-structures ${read_structure1} ${read_structure2} \
--sample ${idSample} \
--library ${idSample}
"""
script:
"""
mkdir tmp

fgbio --tmp-dir=${PWD}/tmp \
FastqToBam \
-i "${idSample}_${idRun}_R1.fastq.gz" "${idSample}_${idRun}_R2.fastq.gz" \
-o "${idSample}_umi_converted.bam" \
--read-structures ${read_structure1} ${read_structure2} \
--sample ${idSample} \
--library ${idSample}
"""
}

// UMI - STEP 2 - MAP THE BAM FILE
// this is necessary because the UMI groups are created based on
// mapping position + same UMI tag

process UMIMapBamFile {
input:
set idPatient, idSample, idRun, file(convertedBam) from umi_converted_bams_ch
file(bwaIndex) from ch_bwa
file(fasta) from ch_fasta
file(fastaFai) from ch_fai

input:
set idPatient, idSample, idRun, file(convertedBam) from umi_converted_bams_ch
file(bwaIndex) from ch_bwa
file(fasta) from ch_fasta
file(fastaFai) from ch_fai

output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_unsorted.bam") into umi_aligned_bams_ch

when: params.umi
output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_unsorted.bam") into umi_aligned_bams_ch

script:
"""
samtools bam2fq -T RX ${convertedBam} | \
bwa-mem2 mem -p -t ${task.cpus} -C -M -R \"@RG\\tID:${idSample}\\tSM:${idSample}\\tPL:Illumina\" \
${fasta} - | \
samtools view -bS - > ${idSample}_umi_unsorted.bam
"""
when: params.umi

script:
aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa"
"""
samtools bam2fq -T RX ${convertedBam} | \
${aligner} mem -p -t ${task.cpus} -C -M -R \"@RG\\tID:${idSample}\\tSM:${idSample}\\tPL:Illumina\" \
${fasta} - | \
samtools view -bS - > ${idSample}_umi_unsorted.bam
"""
}

// UMI - STEP 3 - GROUP READS BY UMIs
Expand All @@ -1079,34 +1078,32 @@ process UMIMapBamFile {
// alternatively we can define this as input for the user to choose from

process GroupReadsByUmi {
publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

input:
set idPatient, idSample, idRun, file(alignedBam) from umi_aligned_bams_ch

output:
file("${idSample}_umi_histogram.txt") into umi_histogram_ch
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-grouped.bam") into umi_grouped_bams_ch
input:
set idPatient, idSample, idRun, file(alignedBam) from umi_aligned_bams_ch

when: params.umi
output:
file("${idSample}_umi_histogram.txt") into umi_histogram_ch
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-grouped.bam") into umi_grouped_bams_ch

script:
"""
mkdir tmp
when: params.umi

samtools view -h ${alignedBam} | \
samblaster -M --addMateTags | \
samtools view -Sb - >${idSample}_unsorted_tagged.bam
script:
"""
mkdir tmp

fgbio --tmp-dir=${PWD}/tmp \
GroupReadsByUmi \
-s Adjacency \
-i ${idSample}_unsorted_tagged.bam \
-o ${idSample}_umi-grouped.bam \
-f ${idSample}_umi_histogram.txt
"""
samtools view -h ${alignedBam} | \
samblaster -M --addMateTags | \
samtools view -Sb - >${idSample}_unsorted_tagged.bam

fgbio --tmp-dir=${PWD}/tmp \
GroupReadsByUmi \
-s Adjacency \
-i ${idSample}_unsorted_tagged.bam \
-o ${idSample}_umi-grouped.bam \
-f ${idSample}_umi_histogram.txt
"""
}

// UMI - STEP 4 - CALL MOLECULAR CONSENSUS
Expand All @@ -1115,27 +1112,26 @@ process GroupReadsByUmi {
// existing workflow from the step mapping

process CallMolecularConsensusReads {
publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publish_dir_mode

input:
set idPatient, idSample, idRun, file(groupedBamFile) from umi_grouped_bams_ch
input:
set idPatient, idSample, idRun, file(groupedBamFile) from umi_grouped_bams_ch

output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-consensus.bam"), val("null") into consensus_bam_ch
output:
tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-consensus.bam"), val("null") into consensus_bam_ch

when: params.umi
when: params.umi

script:
"""
mkdir tmp
script:
"""
mkdir tmp

fgbio --tmp-dir=${PWD}/tmp \
CallMolecularConsensusReads \
-i $groupedBamFile \
-o ${idSample}_umi-consensus.bam \
-M 1 -S Coordinate
"""
fgbio --tmp-dir=${PWD}/tmp \
CallMolecularConsensusReads \
-i $groupedBamFile \
-o ${idSample}_umi-consensus.bam \
-M 1 -S Coordinate
"""
}

// ################# END OF UMI READS PRE-PROCESSING
Expand All @@ -1146,18 +1142,18 @@ process CallMolecularConsensusReads {
input_pair_reads_sentieon = Channel.empty()

if (params.umi) {
inputPairReads = inputPairReads.dump(tag:'INPUT before BWA-mem2 mem')
if (params.sentieon) input_pair_reads_sentieon = consensus_bam_ch
else inputPairReads = consensus_bam_ch
inputPairReads = inputPairReads.dump(tag:'INPUT before mapping')
if (params.sentieon) input_pair_reads_sentieon = consensus_bam_ch
else inputPairReads = consensus_bam_ch
}
else {
if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore
else inputPairReads = inputPairReads.mix(inputBam)
inputPairReads = inputPairReads.dump(tag:'INPUT before BWA-mem2 mem')
if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore
else inputPairReads = inputPairReads.mix(inputBam)
inputPairReads = inputPairReads.dump(tag:'INPUT before mapping')

(inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2)
if (params.sentieon) inputPairReads.close()
else input_pair_reads_sentieon.close()
(inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2)
if (params.sentieon) inputPairReads.close()
else input_pair_reads_sentieon.close()
}

process MapReads {
Expand Down Expand Up @@ -1190,9 +1186,10 @@ process MapReads {
extra = status == 1 ? "-B 3" : ""
convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : ""
input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}"
aligner = params.aligner == "bwa-mem2" ? "bwa-mem2" : "bwa"
"""
${convertToFastq}
bwa-mem2 mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \
${aligner} mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \
${input} | \
samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam
"""
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ params {
split_fastq = null // Fastq files will not be split by default

// Preprocessing
aligner = 'bwa-mem'
markdup_java_options = '"-Xms4000m -Xmx7g"' // Established values for markDuplicates memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details
no_gatk_spark = null // GATK Spark implementation of their tools in local mode used by default
save_bam_mapped = null // Mapped BAMs not saved
Expand Down
Loading