SciLifeLab · maxulysse · Sep 21, 2018 · Aug 16, 2018 · Sep 3, 2018 · Sep 4, 2018
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation
 - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore`
 - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule
+- [#613](https://github.com/SciLifeLab/Sarek/pull/635) - To process targeted sequencing with a target BED
 
 ### `Changed`
 - [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://sarek.scilifelab.se/)
 
-#### An open-source analysis pipeline to detect germline or somatic variants from whole genome sequencing
+#### An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing
 
 [![Nextflow version][nextflow-badge]][nextflow-link]
 [![Travis build status][travis-badge]][travis-link]

diff --git a/annotate.nf b/annotate.nf
@@ -216,7 +216,7 @@ process RunVEP {
   finalannotator = annotator == "snpeff" ? 'merge' : 'vep'
   genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome
   """
-  vep --dir /opt/vep/.vep/ \
+  vep --dir /opt/vep/.vep/  \
   -i ${vcf} \
   -o ${vcf.simpleName}_VEP.ann.vcf \
   --assembly ${genome} \

diff --git a/docs/USE_CASES.md b/docs/USE_CASES.md
@@ -164,7 +164,16 @@ SUBJECT_ID  XX    0    SAMPLEIDN    /samples/SAMPLEIDN.bam    /samples/SAMPLEIDN
 SUBJECT_ID  XX    1    SAMPLEIDT    /samples/SAMPLEIDT.bam    /samples/SAMPLEIDT.bai
 SUBJECT_ID  XX    1    SAMPLEIDR    /samples/SAMPLEIDR.bam    /samples/SAMPLEIDR.bai
 ```
-
 If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file.
 This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space).
 In this case, you need to start with `--step=recalibrate` (see previous section).
+
+## Processing targeted (whole exome or panel) sequencing data
+
+The recommended flow for thrgeted sequencing data is to use the whole genome workflow as it is, but also provide a BED file containing targets for variant calling. 
+The Strelka part of the workflow will pick up these intervals, and activate the `--exome` flag to process deeper coverage. It is adviced to pad the variant calling 
+regions (exons or the target) to some extent before submitting to the workflow. To add the target BED file configure the flow like:
+
+```bash
+nextflow run SciLifeLab/Sarek/germlineVC.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --sample my_panel.tsv
+```
diff --git a/germlineVC.nf b/germlineVC.nf
@@ -363,7 +363,8 @@ process ConcatVCF {
     file(genomeIndex) from Channel.value(referenceMap.genomeIndex)
 
   output:
-    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated
+		// we have this funny *_* pattern to avoid copying the raw calls to publishdir
+    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated
 
 
   when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC
@@ -373,47 +374,14 @@ process ConcatVCF {
   else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf"
   else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf"
 
-  """
-	set -euo pipefail
-  # first make a header from one of the VCF intervals
-  # get rid of interval information only from the GATK command-line, but leave the rest
-  FIRSTVCF=\$(ls *.vcf | head -n 1)
-  sed -n '/^[^#]/q;p' \$FIRSTVCF | \
-  awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \
-  > header
-
-  # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
-  # header in the VCF as it is optional (FreeBayes does not save it, for example)
-  CONTIGS=(\$(cut -f1 ${genomeIndex}))
-
-  # concatenate VCFs in the correct order
-  (
-    cat header
-
-    for chr in "\${CONTIGS[@]}"; do
-      # Skip if globbing would not match any file to avoid errors such as
-      # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
-      # was not processed.
-      pattern="\${chr}_*.vcf"
-      if ! compgen -G "\${pattern}" > /dev/null; then continue; fi
-
-      # ls -v sorts by numeric value ("version"), which means that chr1_100_
-      # is sorted *after* chr1_99_.
-      for vcf in \$(ls -v \${pattern}); do
-        # Determine length of header.
-        # The 'q' command makes sed exit when it sees the first non-header
-        # line, which avoids reading in the entire file.
-        L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l)
-
-        # Then print all non-header lines. Since tail is very fast (nearly as
-        # fast as cat), this is way more efficient than using a single sed,
-        # awk or grep command.
-        tail -n +\$((L+1)) \${vcf}
-      done
-    done
-  ) | bgzip > ${outputFile}.gz
-  tabix ${outputFile}.gz
-  """
+	if(params.targetBED)		// targeted
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}"
+	else										// WGS
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} "
+
+	"""
+	${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions}
+	"""
 }
 
 if (params.verbose) vcfConcatenated = vcfConcatenated.view {
@@ -441,23 +409,32 @@ process RunSingleStrelka {
   when: 'strelka' in tools && !params.onlyQC
 
   script:
-  """
-  configureStrelkaGermlineWorkflow.py \
-  --bam ${bam} \
-  --referenceFasta ${genomeFile} \
-  --runDir Strelka
-
-  python Strelka/runWorkflow.py -m local -j ${task.cpus}
-
-  mv Strelka/results/variants/genome.*.vcf.gz \
-    Strelka_${idSample}_genome.vcf.gz
-  mv Strelka/results/variants/genome.*.vcf.gz.tbi \
-    Strelka_${idSample}_genome.vcf.gz.tbi
-  mv Strelka/results/variants/variants.vcf.gz \
-    Strelka_${idSample}_variants.vcf.gz
-  mv Strelka/results/variants/variants.vcf.gz.tbi \
-    Strelka_${idSample}_variants.vcf.gz.tbi
-  """
+	"""
+	if [ ! -s "${params.targetBED}" ]; then
+		# do WGS
+		configureStrelkaGermlineWorkflow.py \
+		--bam ${bam} \
+		--referenceFasta ${genomeFile} \
+		--runDir Strelka
+	else
+		# WES or targeted
+		bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz
+		tabix call_targets.bed.gz
+		configureStrelkaGermlineWorkflow.py \
+		--bam ${bam} \
+		--referenceFasta ${genomeFile} \
+		--exome \
+		--callRegions call_targets.bed.gz \
+		--runDir Strelka
+	fi
+
+	# always run this part
+		python Strelka/runWorkflow.py -m local -j ${task.cpus}
+		mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz
+		mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi
+		mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz
+		mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi
+	"""
 }
 
 if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view {

diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy
@@ -88,6 +88,8 @@ class SarekUtils {
       'strelka-BP',
       'strelkaBP',
       'tag',
+      'target-BED',
+      'targetBED',
       'test',
       'tools',
       'total-memory',

diff --git a/scripts/concatenateVCFs.sh b/scripts/concatenateVCFs.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# this script concatenates all VCFs that are in the local directory: the 
+# purpose is to make a single VCF from all the VCFs that were created from different intervals
+
+usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }
+
+while getopts "i:c:o:t:" p; do
+	case "${p}" in
+		i)
+			genomeIndex=${OPTARG}
+			;;
+		c)
+			cpus=${OPTARG}
+			;;
+		o)
+			outputFile=${OPTARG}
+			;;
+		t)
+			targetBED=${OPTARG}
+			;;
+		*)
+			usage
+			;;
+	esac
+done
+shift $((OPTIND-1))
+
+if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
+if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
+if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi
+
+set -euo pipefail
+
+# first make a header from one of the VCF intervals
+# get rid of interval information only from the GATK command-line, but leave the rest
+FIRSTVCF=$(ls *.vcf | head -n 1)
+sed -n '/^[^#]/q;p' $FIRSTVCF | \
+awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
+> header
+
+# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
+# header in the VCF as it is optional (FreeBayes does not save it, for example)
+CONTIGS=($(cut -f1 ${genomeIndex}))
+
+# concatenate VCFs in the correct order
+(
+  cat header
+
+  for chr in "${CONTIGS[@]}"; do
+    # Skip if globbing would not match any file to avoid errors such as
+    # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
+    # was not processed.
+    pattern="${chr}_*.vcf"
+    if ! compgen -G "${pattern}" > /dev/null; then continue; fi
+
+    # ls -v sorts by numeric value ("version"), which means that chr1_100_
+    # is sorted *after* chr1_99_.
+    for vcf in $(ls -v ${pattern}); do
+      # Determine length of header.
+      # The 'q' command makes sed exit when it sees the first non-header
+      # line, which avoids reading in the entire file.
+      L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
+
+      # Then print all non-header lines. Since tail is very fast (nearly as
+      # fast as cat), this is way more efficient than using a single sed,
+      # awk or grep command.
+      tail -n +$((L+1)) ${vcf}
+    done
+  done
+) | bgzip -@${cpus} > rawcalls.vcf.gz
+tabix rawcalls.vcf.gz
+
+set +u
+
+# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
+echo "target is $targetBED"
+if [ ! -z ${targetBED+x} ]; then
+	echo "Selecting subset..."
+	bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
+	tabix ${outputFile}.gz
+else
+	# simply rename the raw calls as WGS results
+	for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
+fi
+
diff --git a/somaticVC.nf b/somaticVC.nf
@@ -364,53 +364,22 @@ process ConcatVCF {
     file(genomeIndex) from Channel.value(referenceMap.genomeIndex)
 
   output:
-    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated
+		// we have this funny *_* pattern to avoid copying the raw calls to publishdir
+    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated
+		// TODO DRY with ConcatVCF
 
   when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC
 
   script:
   outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf"
 
-  """
-	set -euo pipefail
-  # first make a header from one of the VCF intervals
-  # get rid of interval information only from the GATK command-line, but leave the rest
-  FIRSTVCF=\$(ls *.vcf | head -n 1)
-  sed -n '/^[^#]/q;p' \$FIRSTVCF | \
-  awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \
-  > header
-
-  # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
-  # header in the VCF as it is optional (FreeBayes does not save it, for example)
-  CONTIGS=(\$(cut -f1 ${genomeIndex}))
-
-  # concatenate VCFs in the correct order
-  (
-    cat header
-
-    for chr in "\${CONTIGS[@]}"; do
-      # Skip if globbing would not match any file to avoid errors such as
-      # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
-      # was not processed.
-      pattern="\${chr}_*.vcf"
-      if ! compgen -G "\${pattern}" > /dev/null; then continue; fi
-
-      # ls -v sorts by numeric value ("version"), which means that chr1_100_
-      # is sorted *after* chr1_99_.
-      for vcf in \$(ls -v \${pattern}); do
-        # Determine length of header.
-        # The 'q' command makes sed exit when it sees the first non-header
-        # line, which avoids reading in the entire file.
-        L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l)
-
-        # Then print all non-header lines. Since tail is very fast (nearly as
-        # fast as cat), this is way more efficient than using a single sed,
-        # awk or grep command.
-        tail -n +\$((L+1)) \${vcf}
-      done
-    done
-  ) | bgzip > ${outputFile}.gz
-  tabix ${outputFile}.gz
+  if(params.targetBED)		// targeted
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}"
+	else										// WGS
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} "
+
+	"""
+	${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions}
   """
 }
 
@@ -439,24 +408,35 @@ process RunStrelka {
   when: 'strelka' in tools && !params.onlyQC
 
   script:
-  """
-  configureStrelkaSomaticWorkflow.py \
-  --tumor ${bamTumor} \
-  --normal ${bamNormal} \
-  --referenceFasta ${genomeFile} \
-  --runDir Strelka
-
-  python Strelka/runWorkflow.py -m local -j ${task.cpus}
-
-  mv Strelka/results/variants/somatic.indels.vcf.gz \
-    Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz
-  mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \
-    Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi
-  mv Strelka/results/variants/somatic.snvs.vcf.gz \
-    Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz
-  mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \
-    Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi
-  """
+	"""
+	if [ ! -s "${params.targetBED}" ]; then
+		# do WGS
+		configureStrelkaSomaticWorkflow.py \
+		--tumor ${bamTumor} \
+		--normal ${bamNormal} \
+		--referenceFasta ${genomeFile} \
+		--runDir Strelka
+	else
+		# WES or targeted
+		bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz
+		tabix call_targets.bed.gz
+		configureStrelkaSomaticWorkflow.py \
+		--tumor ${bamTumor} \
+		--normal ${bamNormal} \
+		--referenceFasta ${genomeFile} \
+		--exome \
+		--callRegions call_targets.bed.gz \
+		--runDir Strelka
+	fi
+
+	python Strelka/runWorkflow.py -m local -j ${task.cpus}
+	# always run this part
+
+	mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz
+	mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi
+	mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz
+	mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi
+	"""
 }
 
 if (params.verbose) strelkaOutput = strelkaOutput.view {