This repository was archived by the owner on Jan 27, 2020. It is now read-only.
forked from nf-core/sarek
-
Notifications
You must be signed in to change notification settings - Fork 7
To process targeted sequencing with a target BED #635
Merged
Merged
Changes from 16 commits
Commits
Show all changes
36 commits
Select commit
Hold shift + click to select a range
a7e54aa
removing spurious VEP directory
e89d52d
Merge remote-tracking branch 'upstream/master'
ae126ae
Strelka targeted is working fine
5082a7a
changes to ConcatVCF to accomodate bcftools isec in germline targets
f980e11
concatenateVCF.sh is now a separate script to avoid code duplication
df4d8d1
added concatOptions and Strelka fix
60d15cc
added getopts, fixed existence check and set +u
9ca0c93
Somatic ConcatVCF also simplified
f09345e
Merge remote-tracking branch 'upstream/master'
fa8aec8
updated documentation
4bd45cf
\n or \n that is the question
738c137
exclamation misplaced
2f69352
killing me softly with a VEP line
210d050
Merge branch 'dev' into master
1331a79
Merge remote-tracking branch 'upstream/dev'
fba72df
concatVCF.sh moved to bin
ce331ea
Added --cpus directive
b0a0da2
Merge branch 'master' of github.com:szilvajuhos/Sarek
70b40d2
Sarek-data updated
43ea430
putting concatenateVCF.sh to bin
d237b8f
adding targetBED to tests and wrapper
5d5407a
temporary fix for vepgrch37 container path problem
def35d8
added target report at the end, targetBED=false in base.config
213b07e
resolved merge conflict
39fdfd4
falling back to tiny.tsv
72c0c2c
simplified tests without singularity
a227630
Merge branch 'dev' into master
be3cc7f
typo fix
ae0add8
Merge branch 'master' of github.com:szilvajuhos/Sarek
813ba52
CHANGELOG changes changed
8204c7a
even less somatic test
aa056c9
fixed spacing
d8c35d5
Fixing fixed fixes in CHANGELOG
fef3c1f
Zenodo REST API to upload data
44c7ffa
Conflict resolve
60da87c
Zenodo tests
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -363,7 +363,8 @@ process ConcatVCF { | |
| file(genomeIndex) from Channel.value(referenceMap.genomeIndex) | ||
|
|
||
| output: | ||
| set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated | ||
| // we have this funny *_* pattern to avoid copying the raw calls to publishdir | ||
| set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated | ||
|
|
||
|
|
||
| when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC | ||
|
|
@@ -373,47 +374,14 @@ process ConcatVCF { | |
| else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf" | ||
| else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" | ||
|
|
||
| """ | ||
| set -euo pipefail | ||
| # first make a header from one of the VCF intervals | ||
| # get rid of interval information only from the GATK command-line, but leave the rest | ||
| FIRSTVCF=\$(ls *.vcf | head -n 1) | ||
| sed -n '/^[^#]/q;p' \$FIRSTVCF | \ | ||
| awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \ | ||
| > header | ||
|
|
||
| # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig | ||
| # header in the VCF as it is optional (FreeBayes does not save it, for example) | ||
| CONTIGS=(\$(cut -f1 ${genomeIndex})) | ||
|
|
||
| # concatenate VCFs in the correct order | ||
| ( | ||
| cat header | ||
|
|
||
| for chr in "\${CONTIGS[@]}"; do | ||
| # Skip if globbing would not match any file to avoid errors such as | ||
| # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 | ||
| # was not processed. | ||
| pattern="\${chr}_*.vcf" | ||
| if ! compgen -G "\${pattern}" > /dev/null; then continue; fi | ||
|
|
||
| # ls -v sorts by numeric value ("version"), which means that chr1_100_ | ||
| # is sorted *after* chr1_99_. | ||
| for vcf in \$(ls -v \${pattern}); do | ||
| # Determine length of header. | ||
| # The 'q' command makes sed exit when it sees the first non-header | ||
| # line, which avoids reading in the entire file. | ||
| L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l) | ||
|
|
||
| # Then print all non-header lines. Since tail is very fast (nearly as | ||
| # fast as cat), this is way more efficient than using a single sed, | ||
| # awk or grep command. | ||
| tail -n +\$((L+1)) \${vcf} | ||
| done | ||
| done | ||
| ) | bgzip > ${outputFile}.gz | ||
| tabix ${outputFile}.gz | ||
| """ | ||
| if(params.targetBED) // targeted | ||
| concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" | ||
| else // WGS | ||
| concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " | ||
|
|
||
| """ | ||
| ${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You should put the new
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cool, thanks, I am still testing, but will move |
||
| """ | ||
| } | ||
|
|
||
| if (params.verbose) vcfConcatenated = vcfConcatenated.view { | ||
|
|
@@ -441,23 +409,32 @@ process RunSingleStrelka { | |
| when: 'strelka' in tools && !params.onlyQC | ||
|
|
||
| script: | ||
| """ | ||
| configureStrelkaGermlineWorkflow.py \ | ||
| --bam ${bam} \ | ||
| --referenceFasta ${genomeFile} \ | ||
| --runDir Strelka | ||
|
|
||
| python Strelka/runWorkflow.py -m local -j ${task.cpus} | ||
|
|
||
| mv Strelka/results/variants/genome.*.vcf.gz \ | ||
| Strelka_${idSample}_genome.vcf.gz | ||
| mv Strelka/results/variants/genome.*.vcf.gz.tbi \ | ||
| Strelka_${idSample}_genome.vcf.gz.tbi | ||
| mv Strelka/results/variants/variants.vcf.gz \ | ||
| Strelka_${idSample}_variants.vcf.gz | ||
| mv Strelka/results/variants/variants.vcf.gz.tbi \ | ||
| Strelka_${idSample}_variants.vcf.gz.tbi | ||
| """ | ||
| """ | ||
| if [ ! -s "${params.targetBED}" ]; then | ||
| # do WGS | ||
| configureStrelkaGermlineWorkflow.py \ | ||
| --bam ${bam} \ | ||
| --referenceFasta ${genomeFile} \ | ||
| --runDir Strelka | ||
| else | ||
| # WES or targeted | ||
| bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz | ||
| tabix call_targets.bed.gz | ||
| configureStrelkaGermlineWorkflow.py \ | ||
| --bam ${bam} \ | ||
| --referenceFasta ${genomeFile} \ | ||
| --exome \ | ||
| --callRegions call_targets.bed.gz \ | ||
| --runDir Strelka | ||
| fi | ||
|
|
||
| # always run this part | ||
| python Strelka/runWorkflow.py -m local -j ${task.cpus} | ||
| mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz | ||
| mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi | ||
| mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz | ||
| mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi | ||
| """ | ||
| } | ||
|
|
||
| if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| #!/usr/bin/env bash | ||
| # this script concatenates all VCFs that are in the local directory: the | ||
| # purpose is to make a single VCF from all the VCFs that were created from different intervals | ||
|
|
||
| usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; } | ||
|
|
||
| while getopts "i:c:o:t:" p; do | ||
| case "${p}" in | ||
| i) | ||
| genomeIndex=${OPTARG} | ||
| ;; | ||
| c) | ||
| cpus=${OPTARG} | ||
| ;; | ||
| o) | ||
| outputFile=${OPTARG} | ||
| ;; | ||
| t) | ||
| targetBED=${OPTARG} | ||
| ;; | ||
| *) | ||
| usage | ||
| ;; | ||
| esac | ||
| done | ||
| shift $((OPTIND-1)) | ||
|
|
||
| if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi | ||
| if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi | ||
| if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi | ||
|
|
||
| set -euo pipefail | ||
|
|
||
| # first make a header from one of the VCF intervals | ||
| # get rid of interval information only from the GATK command-line, but leave the rest | ||
| FIRSTVCF=$(ls *.vcf | head -n 1) | ||
| sed -n '/^[^#]/q;p' $FIRSTVCF | \ | ||
| awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \ | ||
| > header | ||
|
|
||
| # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig | ||
| # header in the VCF as it is optional (FreeBayes does not save it, for example) | ||
| CONTIGS=($(cut -f1 ${genomeIndex})) | ||
|
|
||
| # concatenate VCFs in the correct order | ||
| ( | ||
| cat header | ||
|
|
||
| for chr in "${CONTIGS[@]}"; do | ||
| # Skip if globbing would not match any file to avoid errors such as | ||
| # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 | ||
| # was not processed. | ||
| pattern="${chr}_*.vcf" | ||
| if ! compgen -G "${pattern}" > /dev/null; then continue; fi | ||
|
|
||
| # ls -v sorts by numeric value ("version"), which means that chr1_100_ | ||
| # is sorted *after* chr1_99_. | ||
| for vcf in $(ls -v ${pattern}); do | ||
| # Determine length of header. | ||
| # The 'q' command makes sed exit when it sees the first non-header | ||
| # line, which avoids reading in the entire file. | ||
| L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) | ||
|
|
||
| # Then print all non-header lines. Since tail is very fast (nearly as | ||
| # fast as cat), this is way more efficient than using a single sed, | ||
| # awk or grep command. | ||
| tail -n +$((L+1)) ${vcf} | ||
| done | ||
| done | ||
| ) | bgzip -@${cpus} > rawcalls.vcf.gz | ||
| tabix rawcalls.vcf.gz | ||
|
|
||
| set +u | ||
|
|
||
| # now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided | ||
| echo "target is $targetBED" | ||
| if [ ! -z ${targetBED+x} ]; then | ||
| echo "Selecting subset..." | ||
| bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz | ||
| tabix ${outputFile}.gz | ||
| else | ||
| # simply rename the raw calls as WGS results | ||
| for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done | ||
| fi | ||
|
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.