-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathvg_trio_multi_map_call.wdl
More file actions
670 lines (641 loc) · 30.4 KB
/
vg_trio_multi_map_call.wdl
File metadata and controls
670 lines (641 loc) · 30.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
version 1.0
### vg_trio_multi_map_call.wdl ###
## Author: Charles Markello
## Description: Trio-backed VG mapping and variant calling workflow for mother-father-child trio datasets.
## Reference: https://github.com/vgteam/vg/wiki
import "./vg_multi_map_call.wdl" as vgMultiMapCallWorkflow
import "./vg_construct_and_index.wdl" as vgConstructWorkflow
import "./vg_indel_realign.wdl" as vgIndelRealignmentWorkflow
# Trio-backed VG pipeline:
#   1) Map + GVCF-call mother, father and proband against the base graph.
#   2) Joint-genotype the trio (GATK or GLnexus), optionally phase with WhatsHap.
#   3) Build a parental graph from the phased parental variants.
#   4) Re-map + re-call the proband and siblings against the parental graph.
#   5) Joint-genotype again, optionally snpEff-annotate, and indel-realign all BAMs.
workflow vgTrioPipeline {
    meta {
        author: "Charles Markello"
        email: "cmarkell@ucsc.edu"
        Updated: "Parsa Eskandar"
        description: "Core VG mapping and variant calling workflow for pedigree datasets."
    }
    input {
        File MATERNAL_INPUT_READ_FILE_1 # Input maternal 1st read pair fastq.gz
        File MATERNAL_INPUT_READ_FILE_2 # Input maternal 2nd read pair fastq.gz
        File PATERNAL_INPUT_READ_FILE_1 # Input paternal 1st read pair fastq.gz
        File PATERNAL_INPUT_READ_FILE_2 # Input paternal 2nd read pair fastq.gz
        Array[File]+ SIBLING_INPUT_READ_FILE_1_LIST # Input 1st read pair list where the proband file is listed first followed by sibling fastq.gz
        Array[File]+ SIBLING_INPUT_READ_FILE_2_LIST # Input 2nd read pair list where the proband file is listed first followed by sibling fastq.gz
        Array[String]+ SAMPLE_NAME_SIBLING_LIST # Sample name for siblings where the proband file is listed first followed by the siblings
        String SAMPLE_NAME_MATERNAL # Sample name for the mother
        String SAMPLE_NAME_PATERNAL # Sample name for the father
        String VG_CONTAINER = "quay.io/vgteam/vg:v1.64.0" # VG Container used in the pipeline (e.g. quay.io/vgteam/vg:v1.64.0)
        Int READS_PER_CHUNK = 10000000 # Number of reads contained in each mapping chunk (20000000 for wgs)
        Int CHUNK_BASES = 50000000 # Number of bases to chunk .gam alignment files for variant calling
        Int OVERLAP = 2000 # Number of overlapping bases between each .gam chunk
        File? PATH_LIST_FILE # (OPTIONAL) Text file where each line is a path name in the XG index
        File XG_FILE # Path to .xg index file
        File? GCSA_FILE # (OPTIONAL) Path to .gcsa index file
        File? GCSA_LCP_FILE # (OPTIONAL) Path to .gcsa.lcp index file
        File? GBWT_FILE # (OPTIONAL) Path to .gbwt index file
        File? GGBWT_FILE # (OPTIONAL) Path to .gg index file
        File? DIST_FILE # (OPTIONAL) Path to .dist index file
        File? MIN_FILE # (OPTIONAL) Path to .min index file
        File? SNARLS_FILE # (OPTIONAL) Path to .snarls index file
        File REF_FILE # Path to .fa canonical reference fasta (only grch37/hg19 currently supported)
        File REF_INDEX_FILE # Path to .fai index of the REF_FILE fasta reference
        File REF_DICT_FILE # Path to .dict file of the REF_FILE fasta reference
        File? SNPEFF_DATABASE # Path to snpeff database .zip file for snpEff annotation functionality.
        # Resource requirements passed through to the component tasks/subworkflows.
        Int SPLIT_READ_CORES = 32
        Int SPLIT_READ_DISK = 200
        Int MAP_CORES = 32
        Int MAP_DISK = 100
        Int MAP_MEM = 100
        Int MERGE_GAM_CORES = 56
        Int MERGE_GAM_DISK = 400
        Int MERGE_GAM_MEM = 100
        Int MERGE_GAM_TIME = 1200
        Int CHUNK_GAM_CORES = 32
        Int CHUNK_GAM_DISK = 400
        Int CHUNK_GAM_MEM = 100
        Int VGCALL_CORES = 8
        Int VGCALL_DISK = 40
        Int VGCALL_MEM = 64
        Array[String]+ CONTIGS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"]
        File REF_FASTA_GZ
        File PED_FILE
        File? GEN_MAP_FILES
        String GRAPH_NAME
        String MAPPER = "GIRAFFE" # Set to 'MAP' to use the "VG MAP" algorithm, set to 'MPMAP' to use "VG MPMAP" algorithm, set to 'GIRAFFE' to use "VG GIRAFFE".
        Boolean DEEPVARIANT_MODE = false # Set to 'true' to use the DeepVariant variant caller. Set to 'false' to use GATK HaplotypeCallers genotyper.
        Boolean GIRAFFE_INDEXES = true # Set to 'true' to construct the GBWT index which incorporates haplotype information into the graph.
        Boolean USE_DECOYS = true # Set to 'true' to include decoy contigs from the FASTA reference into the graph reference.
        Boolean SNPEFF_ANNOTATION = true # Set to 'true' to run snpEff annotation on the joint genotyped VCF.
        Boolean CLEANUP_FILES = true # Set to 'false' to turn off intermediate file cleanup.
        String DECOY_REGEX = ">GL\|>NC_007605\|>hs37d5" # grep regular expression string that is used to extract decoy contig ids. USE_DECOYS must be set to 'true'
    }
    # The proband is always the first entry of the sibling lists.
    File PROBAND_INPUT_READ_FILE_1 = SIBLING_INPUT_READ_FILE_1_LIST[0] # Input proband 1st read pair fastq.gz
    File PROBAND_INPUT_READ_FILE_2 = SIBLING_INPUT_READ_FILE_2_LIST[0] # Input proband 2nd read pair fastq.gz
    String SAMPLE_NAME_PROBAND = SAMPLE_NAME_SIBLING_LIST[0] # Sample name for the proband
    #######################################################
    ## Run mapping and variant calling workflows on Trio ##
    #######################################################
    # First-iteration map+call of each trio member against the base graph,
    # in GVCF mode so the per-sample outputs can be joint-genotyped below.
    call vgMultiMapCallWorkflow.vgMultiMapCall as maternalMapCallWorkflow {
        input:
            INPUT_READ_FILE_1=MATERNAL_INPUT_READ_FILE_1,
            INPUT_READ_FILE_2=MATERNAL_INPUT_READ_FILE_2,
            SAMPLE_NAME=SAMPLE_NAME_MATERNAL,
            VG_CONTAINER=VG_CONTAINER,
            READS_PER_CHUNK=READS_PER_CHUNK,
            PATH_LIST_FILE=PATH_LIST_FILE,
            XG_FILE=XG_FILE,
            GCSA_FILE=GCSA_FILE,
            GCSA_LCP_FILE=GCSA_LCP_FILE,
            GBWT_FILE=GBWT_FILE,
            GGBWT_FILE=GGBWT_FILE,
            DIST_FILE=DIST_FILE,
            MIN_FILE=MIN_FILE,
            SNARLS_FILE=SNARLS_FILE,
            REF_FILE=REF_FILE,
            REF_INDEX_FILE=REF_INDEX_FILE,
            REF_DICT_FILE=REF_DICT_FILE,
            SPLIT_READ_CORES=SPLIT_READ_CORES,
            SPLIT_READ_DISK=SPLIT_READ_DISK,
            MAP_CORES=MAP_CORES,
            MAP_DISK=MAP_DISK,
            MAP_MEM=MAP_MEM,
            MERGE_GAM_CORES=MERGE_GAM_CORES,
            MERGE_GAM_DISK=MERGE_GAM_DISK,
            MERGE_GAM_MEM=MERGE_GAM_MEM,
            MERGE_GAM_TIME=MERGE_GAM_TIME,
            CHUNK_GAM_CORES=CHUNK_GAM_CORES,
            CHUNK_GAM_DISK=CHUNK_GAM_DISK,
            CHUNK_GAM_MEM=CHUNK_GAM_MEM,
            VGCALL_CORES=VGCALL_CORES,
            VGCALL_DISK=VGCALL_DISK,
            VGCALL_MEM=VGCALL_MEM,
            MAPPER=MAPPER,
            CLEANUP_FILES=CLEANUP_FILES,
            SURJECT_MODE=true,
            DEEPVARIANT_MODE=DEEPVARIANT_MODE,
            GVCF_MODE=true,
            SNPEFF_ANNOTATION=false
    }
    call vgMultiMapCallWorkflow.vgMultiMapCall as paternalMapCallWorkflow {
        input:
            INPUT_READ_FILE_1=PATERNAL_INPUT_READ_FILE_1,
            INPUT_READ_FILE_2=PATERNAL_INPUT_READ_FILE_2,
            SAMPLE_NAME=SAMPLE_NAME_PATERNAL,
            VG_CONTAINER=VG_CONTAINER,
            READS_PER_CHUNK=READS_PER_CHUNK,
            PATH_LIST_FILE=PATH_LIST_FILE,
            XG_FILE=XG_FILE,
            GCSA_FILE=GCSA_FILE,
            GCSA_LCP_FILE=GCSA_LCP_FILE,
            GBWT_FILE=GBWT_FILE,
            GGBWT_FILE=GGBWT_FILE,
            DIST_FILE=DIST_FILE,
            MIN_FILE=MIN_FILE,
            SNARLS_FILE=SNARLS_FILE,
            REF_FILE=REF_FILE,
            REF_INDEX_FILE=REF_INDEX_FILE,
            REF_DICT_FILE=REF_DICT_FILE,
            SPLIT_READ_CORES=SPLIT_READ_CORES,
            SPLIT_READ_DISK=SPLIT_READ_DISK,
            MAP_CORES=MAP_CORES,
            MAP_DISK=MAP_DISK,
            MAP_MEM=MAP_MEM,
            MERGE_GAM_CORES=MERGE_GAM_CORES,
            MERGE_GAM_DISK=MERGE_GAM_DISK,
            MERGE_GAM_MEM=MERGE_GAM_MEM,
            MERGE_GAM_TIME=MERGE_GAM_TIME,
            CHUNK_GAM_CORES=CHUNK_GAM_CORES,
            CHUNK_GAM_DISK=CHUNK_GAM_DISK,
            CHUNK_GAM_MEM=CHUNK_GAM_MEM,
            VGCALL_CORES=VGCALL_CORES,
            VGCALL_DISK=VGCALL_DISK,
            VGCALL_MEM=VGCALL_MEM,
            MAPPER=MAPPER,
            CLEANUP_FILES=CLEANUP_FILES,
            SURJECT_MODE=true,
            DEEPVARIANT_MODE=DEEPVARIANT_MODE,
            GVCF_MODE=true,
            SNPEFF_ANNOTATION=false
    }
    call vgMultiMapCallWorkflow.vgMultiMapCall as probandMapCallWorkflow {
        input:
            INPUT_READ_FILE_1=PROBAND_INPUT_READ_FILE_1,
            INPUT_READ_FILE_2=PROBAND_INPUT_READ_FILE_2,
            SAMPLE_NAME=SAMPLE_NAME_PROBAND,
            VG_CONTAINER=VG_CONTAINER,
            READS_PER_CHUNK=READS_PER_CHUNK,
            PATH_LIST_FILE=PATH_LIST_FILE,
            XG_FILE=XG_FILE,
            GCSA_FILE=GCSA_FILE,
            GCSA_LCP_FILE=GCSA_LCP_FILE,
            GBWT_FILE=GBWT_FILE,
            GGBWT_FILE=GGBWT_FILE,
            DIST_FILE=DIST_FILE,
            MIN_FILE=MIN_FILE,
            SNARLS_FILE=SNARLS_FILE,
            REF_FILE=REF_FILE,
            REF_INDEX_FILE=REF_INDEX_FILE,
            REF_DICT_FILE=REF_DICT_FILE,
            SPLIT_READ_CORES=SPLIT_READ_CORES,
            SPLIT_READ_DISK=SPLIT_READ_DISK,
            MAP_CORES=MAP_CORES,
            MAP_DISK=MAP_DISK,
            MAP_MEM=MAP_MEM,
            MERGE_GAM_CORES=MERGE_GAM_CORES,
            MERGE_GAM_DISK=MERGE_GAM_DISK,
            MERGE_GAM_MEM=MERGE_GAM_MEM,
            MERGE_GAM_TIME=MERGE_GAM_TIME,
            CHUNK_GAM_CORES=CHUNK_GAM_CORES,
            CHUNK_GAM_DISK=CHUNK_GAM_DISK,
            CHUNK_GAM_MEM=CHUNK_GAM_MEM,
            VGCALL_CORES=VGCALL_CORES,
            VGCALL_DISK=VGCALL_DISK,
            VGCALL_MEM=VGCALL_MEM,
            MAPPER=MAPPER,
            CLEANUP_FILES=CLEANUP_FILES,
            SURJECT_MODE=true,
            DEEPVARIANT_MODE=DEEPVARIANT_MODE,
            GVCF_MODE=true,
            SNPEFF_ANNOTATION=false
    }
    ###############################
    ## Run trio joint genotyping ##
    ###############################
    # Exactly one of the two branches below runs, depending on DEEPVARIANT_MODE.
    if (!DEEPVARIANT_MODE) {
        call runGATKCombineGenotypeGVCFs as gatkJointGenotyper1st {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_gvcf_file_maternal=maternalMapCallWorkflow.output_vcf,
                in_gvcf_file_paternal=paternalMapCallWorkflow.output_vcf,
                in_gvcf_files_siblings=[probandMapCallWorkflow.output_vcf],
                in_reference_file=REF_FILE,
                in_reference_index_file=REF_INDEX_FILE,
                in_reference_dict_file=REF_DICT_FILE
        }
        call vgMultiMapCallWorkflow.bgzipMergedVCF as bgzipGATKGVCF {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_merged_vcf_file=gatkJointGenotyper1st.joint_genotyped_vcf,
                in_vg_container=VG_CONTAINER,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
    }
    if (DEEPVARIANT_MODE) {
        call runDeepVariantJointGenotyper as deepVarJointGenotyper1st {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_gvcf_file_maternal=maternalMapCallWorkflow.output_vcf,
                in_gvcf_file_paternal=paternalMapCallWorkflow.output_vcf,
                in_gvcf_files_siblings=[probandMapCallWorkflow.output_vcf]
        }
    }
    #####################################
    ## Run parental graph construction ##
    #####################################
    # Pick whichever joint-genotyping branch actually ran.
    File output_joint_genotyped_vcf = select_first([bgzipGATKGVCF.output_merged_vcf, deepVarJointGenotyper1st.joint_genotyped_vcf])
    File output_joint_genotyped_vcf_index = select_first([bgzipGATKGVCF.output_merged_vcf_index, deepVarJointGenotyper1st.joint_genotyped_vcf_index])
    if (GIRAFFE_INDEXES) {
        # Phase the cohort per contig with WhatsHap so the parental graph can
        # carry haplotype (GBWT) information.
        call runSplitJointGenotypedVCF as splitJointGenotypedVCF {
            input:
                in_proband_sample_name=SAMPLE_NAME_PROBAND,
                in_maternal_sample_name=SAMPLE_NAME_MATERNAL,
                in_paternal_sample_name=SAMPLE_NAME_PATERNAL,
                joint_genotyped_vcf=output_joint_genotyped_vcf,
                joint_genotyped_vcf_index=output_joint_genotyped_vcf_index,
                contigs=CONTIGS,
                filter_parents=false
        }
        scatter (contig_pair in zip(CONTIGS, splitJointGenotypedVCF.contig_vcfs)) {
            call runWhatsHapPhasing {
                input:
                    in_cohort_sample_name=SAMPLE_NAME_PROBAND,
                    joint_genotyped_vcf=contig_pair.right,
                    in_maternal_bam=maternalMapCallWorkflow.output_bam,
                    in_maternal_bam_index=maternalMapCallWorkflow.output_bam_index,
                    in_paternal_bam=paternalMapCallWorkflow.output_bam,
                    in_paternal_bam_index=paternalMapCallWorkflow.output_bam_index,
                    in_proband_bam=probandMapCallWorkflow.output_bam,
                    in_proband_bam_index=probandMapCallWorkflow.output_bam_index,
                    in_ped_file=PED_FILE,
                    in_genetic_map=GEN_MAP_FILES,
                    in_contig=contig_pair.left,
                    in_reference_file=REF_FILE,
                    in_reference_index_file=REF_INDEX_FILE,
                    in_reference_dict_file=REF_DICT_FILE
            }
        }
        call vgMultiMapCallWorkflow.concatClippedVCFChunks as concatCohortPhasedVCFs {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_clipped_vcf_chunk_files=runWhatsHapPhasing.phased_cohort_vcf,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
        call vgMultiMapCallWorkflow.bgzipMergedVCF as bgzipCohortPhasedVCF {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_merged_vcf_file=concatCohortPhasedVCFs.output_merged_vcf,
                in_vg_container=VG_CONTAINER,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
    }
    # Use the phased cohort VCF when it was produced, otherwise the unphased one.
    File cohort_vcf_output = select_first([bgzipCohortPhasedVCF.output_merged_vcf, output_joint_genotyped_vcf])
    File cohort_vcf_index_output = select_first([bgzipCohortPhasedVCF.output_merged_vcf_index, output_joint_genotyped_vcf_index])
    # Restrict to the parents' samples per contig to build the parental graph.
    call runSplitJointGenotypedVCF as splitPhasedVCF {
        input:
            in_proband_sample_name=SAMPLE_NAME_PROBAND,
            in_maternal_sample_name=SAMPLE_NAME_MATERNAL,
            in_paternal_sample_name=SAMPLE_NAME_PATERNAL,
            joint_genotyped_vcf=cohort_vcf_output,
            joint_genotyped_vcf_index=cohort_vcf_index_output,
            contigs=CONTIGS,
            filter_parents=true
    }
    call vgConstructWorkflow.vg_construct_and_index as constructGraphIndexWorkflow {
        input:
            graph_name=GRAPH_NAME,
            ref_fasta_gz=REF_FASTA_GZ,
            contigs=CONTIGS,
            contigs_vcf_gz=splitPhasedVCF.contig_vcfs,
            giraffe_indexes=GIRAFFE_INDEXES,
            use_decoys=USE_DECOYS,
            decoy_regex=DECOY_REGEX,
            vg_docker=VG_CONTAINER
    }
    ######################################################################################################
    ## Run mapping and variant calling workflow of proband and sibling reads against the parental graph ##
    ######################################################################################################
    Array[Pair[File,File]] read_pair_files_list = zip(SIBLING_INPUT_READ_FILE_1_LIST, SIBLING_INPUT_READ_FILE_2_LIST)
    scatter (read_pair_set in zip(read_pair_files_list, SAMPLE_NAME_SIBLING_LIST)) {
        Pair[File,File] read_pair_files = read_pair_set.left
        call vgMultiMapCallWorkflow.vgMultiMapCall as secondIterationSiblingMapCall {
            input:
                INPUT_READ_FILE_1=read_pair_files.left,
                INPUT_READ_FILE_2=read_pair_files.right,
                SAMPLE_NAME=read_pair_set.right,
                VG_CONTAINER=VG_CONTAINER,
                READS_PER_CHUNK=READS_PER_CHUNK,
                PATH_LIST_FILE=PATH_LIST_FILE,
                XG_FILE=constructGraphIndexWorkflow.xg,
                GCSA_FILE=constructGraphIndexWorkflow.gcsa,
                GCSA_LCP_FILE=constructGraphIndexWorkflow.gcsa_lcp,
                GBWT_FILE=constructGraphIndexWorkflow.gbwt,
                GGBWT_FILE=constructGraphIndexWorkflow.ggbwt,
                DIST_FILE=constructGraphIndexWorkflow.dist,
                MIN_FILE=constructGraphIndexWorkflow.min,
                SNARLS_FILE=constructGraphIndexWorkflow.snarls,
                REF_FILE=REF_FILE,
                REF_INDEX_FILE=REF_INDEX_FILE,
                REF_DICT_FILE=REF_DICT_FILE,
                SPLIT_READ_CORES=SPLIT_READ_CORES,
                SPLIT_READ_DISK=SPLIT_READ_DISK,
                MAP_CORES=MAP_CORES,
                MAP_DISK=MAP_DISK,
                MAP_MEM=MAP_MEM,
                MERGE_GAM_CORES=MERGE_GAM_CORES,
                MERGE_GAM_DISK=MERGE_GAM_DISK,
                MERGE_GAM_MEM=MERGE_GAM_MEM,
                MERGE_GAM_TIME=MERGE_GAM_TIME,
                CHUNK_GAM_CORES=CHUNK_GAM_CORES,
                CHUNK_GAM_DISK=CHUNK_GAM_DISK,
                CHUNK_GAM_MEM=CHUNK_GAM_MEM,
                VGCALL_CORES=VGCALL_CORES,
                VGCALL_DISK=VGCALL_DISK,
                VGCALL_MEM=VGCALL_MEM,
                MAPPER=MAPPER,
                CLEANUP_FILES=CLEANUP_FILES,
                SURJECT_MODE=true,
                DEEPVARIANT_MODE=DEEPVARIANT_MODE,
                GVCF_MODE=true,
                SNPEFF_ANNOTATION=false
        }
    }
    # The subworkflow outputs are optional; drop any Nones before downstream use.
    Array[File?] output_sibling_bam_list_maybes = secondIterationSiblingMapCall.output_bam
    Array[File?] output_sibling_bam_index_list_maybes = secondIterationSiblingMapCall.output_bam_index
    Array[File] output_sibling_bam_list = select_all(output_sibling_bam_list_maybes)
    Array[File] output_sibling_bam_index_list = select_all(output_sibling_bam_index_list_maybes)
    Array[File?] gvcf_files_siblings_maybes = secondIterationSiblingMapCall.output_vcf
    Array[File] gvcf_files_siblings = select_all(gvcf_files_siblings_maybes)
    #######################################################
    ## Run 2nd trio joint genotyping on new proband GVCF ##
    #######################################################
    if (!DEEPVARIANT_MODE) {
        call runGATKCombineGenotypeGVCFs as gatkJointGenotyper2nd {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_gvcf_file_maternal=maternalMapCallWorkflow.output_vcf,
                in_gvcf_file_paternal=paternalMapCallWorkflow.output_vcf,
                in_gvcf_files_siblings=gvcf_files_siblings,
                in_reference_file=REF_FILE,
                in_reference_index_file=REF_INDEX_FILE,
                in_reference_dict_file=REF_DICT_FILE
        }
        call vgMultiMapCallWorkflow.bgzipMergedVCF as bgzip2ndGATKGVCF {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_merged_vcf_file=gatkJointGenotyper2nd.joint_genotyped_vcf,
                in_vg_container=VG_CONTAINER,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
    }
    if (DEEPVARIANT_MODE) {
        call runDeepVariantJointGenotyper as deepVarJointGenotyper2nd {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_gvcf_file_maternal=maternalMapCallWorkflow.output_vcf,
                in_gvcf_file_paternal=paternalMapCallWorkflow.output_vcf,
                in_gvcf_files_siblings=gvcf_files_siblings
        }
    }
    File output_2nd_joint_genotyped_vcf = select_first([bgzip2ndGATKGVCF.output_merged_vcf, deepVarJointGenotyper2nd.joint_genotyped_vcf])
    File output_2nd_joint_genotyped_vcf_index = select_first([bgzip2ndGATKGVCF.output_merged_vcf_index, deepVarJointGenotyper2nd.joint_genotyped_vcf_index])
    # Run snpEff annotation on final VCF as desired
    if (SNPEFF_ANNOTATION && defined(SNPEFF_DATABASE)) {
        call vgMultiMapCallWorkflow.normalizeVCF as normalizeCohortVCF {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_bgzip_vcf_file=output_2nd_joint_genotyped_vcf,
                in_vgcall_cores=VGCALL_CORES,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
        call vgMultiMapCallWorkflow.snpEffAnnotateVCF as snpEffAnnotateCohortVCF {
            input:
                in_sample_name=SAMPLE_NAME_PROBAND,
                in_normalized_vcf_file=normalizeCohortVCF.output_normalized_vcf,
                in_snpeff_database=SNPEFF_DATABASE,
                in_vgcall_cores=VGCALL_CORES,
                in_vgcall_disk=VGCALL_DISK,
                in_vgcall_mem=VGCALL_MEM
        }
    }
    # BUGFIX: this guard must be the exact complement of the snpEff branch above.
    # The previous condition (!SNPEFF_ANNOTATION) left BOTH final_vcf_output and
    # the snpEff output undefined when SNPEFF_ANNOTATION was true but no
    # SNPEFF_DATABASE was supplied, making the select_first() in outputs fail.
    if (!SNPEFF_ANNOTATION || !defined(SNPEFF_DATABASE)) {
        File final_vcf_output = output_2nd_joint_genotyped_vcf
    }
    ######################################################
    ## Run indel realignment workflows on pedigree bams ##
    ######################################################
    call vgIndelRealignmentWorkflow.vgIndelRealign as maternalIndelRealignmentWorkflow {
        input:
            INPUT_BAM_FILE=maternalMapCallWorkflow.output_bam,
            INPUT_BAM_FILE_INDEX=maternalMapCallWorkflow.output_bam_index,
            SAMPLE_NAME=SAMPLE_NAME_MATERNAL,
            VG_CONTAINER=VG_CONTAINER,
            PATH_LIST_FILE=PATH_LIST_FILE,
            XG_FILE=XG_FILE,
            REF_FILE=REF_FILE,
            REF_INDEX_FILE=REF_INDEX_FILE,
            REF_DICT_FILE=REF_DICT_FILE,
            MAP_CORES=MAP_CORES,
            MAP_DISK=MAP_DISK,
            MAP_MEM=MAP_MEM
    }
    call vgIndelRealignmentWorkflow.vgIndelRealign as paternalIndelRealignmentWorkflow {
        input:
            INPUT_BAM_FILE=paternalMapCallWorkflow.output_bam,
            INPUT_BAM_FILE_INDEX=paternalMapCallWorkflow.output_bam_index,
            SAMPLE_NAME=SAMPLE_NAME_PATERNAL,
            VG_CONTAINER=VG_CONTAINER,
            PATH_LIST_FILE=PATH_LIST_FILE,
            XG_FILE=XG_FILE,
            REF_FILE=REF_FILE,
            REF_INDEX_FILE=REF_INDEX_FILE,
            REF_DICT_FILE=REF_DICT_FILE,
            MAP_CORES=MAP_CORES,
            MAP_DISK=MAP_DISK,
            MAP_MEM=MAP_MEM
    }
    Array[Pair[File,File]] bam_pair_files_list = zip(output_sibling_bam_list, output_sibling_bam_index_list)
    scatter (bam_pair_set in zip(bam_pair_files_list, SAMPLE_NAME_SIBLING_LIST)) {
        Pair[File,File] bam_pair_files = bam_pair_set.left
        call vgIndelRealignmentWorkflow.vgIndelRealign as siblingIndelRealignmentWorkflow {
            input:
                INPUT_BAM_FILE=bam_pair_files.left,
                INPUT_BAM_FILE_INDEX=bam_pair_files.right,
                SAMPLE_NAME=bam_pair_set.right,
                VG_CONTAINER=VG_CONTAINER,
                PATH_LIST_FILE=PATH_LIST_FILE,
                XG_FILE=XG_FILE,
                REF_FILE=REF_FILE,
                REF_INDEX_FILE=REF_INDEX_FILE,
                REF_DICT_FILE=REF_DICT_FILE,
                MAP_CORES=MAP_CORES,
                MAP_DISK=MAP_DISK,
                MAP_MEM=MAP_MEM
        }
    }
    Array[File?] output_sibling_indel_bam_list_maybes = siblingIndelRealignmentWorkflow.output_bam
    Array[File?] output_sibling_indel_bam_index_list_maybes = siblingIndelRealignmentWorkflow.output_bam_index
    output {
        # Final cohort VCF: snpEff-annotated when annotation ran, plain otherwise.
        File output_cohort_vcf = select_first([snpEffAnnotateCohortVCF.output_snpeff_annotated_vcf, final_vcf_output])
        File? output_maternal_bam = maternalIndelRealignmentWorkflow.output_bam
        File? output_maternal_bam_index = maternalIndelRealignmentWorkflow.output_bam_index
        File? output_paternal_bam = paternalIndelRealignmentWorkflow.output_bam
        File? output_paternal_bam_index = paternalIndelRealignmentWorkflow.output_bam_index
        Array[File] output_gvcf_files_siblings = gvcf_files_siblings
        Array[File] final_output_sibling_bam_list = select_all(output_sibling_indel_bam_list_maybes)
        Array[File] final_output_sibling_bam_index_list = select_all(output_sibling_indel_bam_index_list_maybes)
    }
}
########################
### TASK DEFINITIONS ###
########################
# Merge the trio's per-sample GVCFs with GATK CombineGVCFs, then
# joint-genotype the combined GVCF with GATK GenotypeGVCFs to produce a
# single multi-sample cohort VCF (uncompressed).
task runGATKCombineGenotypeGVCFs {
    input {
        String in_sample_name                # Cohort/proband name used to prefix output files
        File in_gvcf_file_maternal           # Maternal bgzipped GVCF
        File in_gvcf_file_paternal           # Paternal bgzipped GVCF
        Array[File]+ in_gvcf_files_siblings  # Sibling bgzipped GVCFs, proband first
        File in_reference_file               # Reference FASTA
        File in_reference_index_file         # .fai index of the reference FASTA
        File in_reference_dict_file          # .dict sequence dictionary of the reference
    }
    command <<<
        set -exu -o pipefail
        # (Re)build tabix indexes so GATK can random-access each input GVCF.
        tabix -f -p vcf ~{in_gvcf_file_maternal}
        tabix -f -p vcf ~{in_gvcf_file_paternal}
        for sibling_gvcf_file in ~{sep=" " in_gvcf_files_siblings} ; do
            tabix -f -p vcf "${sibling_gvcf_file}"
        done
        gatk CombineGVCFs \
          --reference ~{in_reference_file} \
          -V ~{in_gvcf_file_maternal} -V ~{in_gvcf_file_paternal} -V ~{sep=" -V " in_gvcf_files_siblings} \
          --output ~{in_sample_name}_cohort.combined.gvcf \
        && gatk GenotypeGVCFs \
          --reference ~{in_reference_file} \
          --variant ~{in_sample_name}_cohort.combined.gvcf \
          --output ~{in_sample_name}_cohort.jointgenotyped.vcf
    >>>
    output {
        # Uncompressed joint-genotyped cohort VCF (bgzipped by a downstream task).
        File joint_genotyped_vcf = "~{in_sample_name}_cohort.jointgenotyped.vcf"
    }
    runtime {
        memory: 100 + " GB"
        cpu: 32
        docker: "broadinstitute/gatk@sha256:cc8981d0527e716775645b04a7f59e96a52ad59a7ae9788ddc47902384bf35aa"
    }
}
# Joint-genotype DeepVariant per-sample GVCFs for the trio using GLnexus
# (DeepVariantWGS configuration), producing a bgzipped, tabix-indexed
# multi-sample cohort VCF.
task runDeepVariantJointGenotyper {
    input {
        String in_sample_name                # Cohort/proband name used to prefix output files
        File in_gvcf_file_maternal           # Maternal bgzipped GVCF
        File in_gvcf_file_paternal           # Paternal bgzipped GVCF
        Array[File]+ in_gvcf_files_siblings  # Sibling bgzipped GVCFs, proband first
    }
    command <<<
        set -exu -o pipefail
        # (Re)build tabix indexes for each input GVCF.
        tabix -f -p vcf ~{in_gvcf_file_maternal}
        tabix -f -p vcf ~{in_gvcf_file_paternal}
        for sibling_gvcf_file in ~{sep=" " in_gvcf_files_siblings} ; do
            tabix -f -p vcf "${sibling_gvcf_file}"
        done
        # glnexus_cli writes BCF to stdout; convert to VCF, bgzip, and index it.
        /usr/local/bin/glnexus_cli \
          --config DeepVariantWGS \
          ~{in_gvcf_file_maternal} \
          ~{in_gvcf_file_paternal} \
          ~{sep=" " in_gvcf_files_siblings} \
        | bcftools view - | bgzip -c > ~{in_sample_name}_cohort.jointgenotyped.vcf.gz \
        && tabix -f -p vcf ~{in_sample_name}_cohort.jointgenotyped.vcf.gz
    >>>
    output {
        File joint_genotyped_vcf = "~{in_sample_name}_cohort.jointgenotyped.vcf.gz"
        File joint_genotyped_vcf_index = "~{in_sample_name}_cohort.jointgenotyped.vcf.gz.tbi"
    }
    runtime {
        preemptible: 5
        maxRetries: 5
        memory: 100 + " GB"
        cpu: 32
        docker: "quay.io/mlin/glnexus:v1.2.7"
    }
}
# Split a joint-genotyped VCF into one bgzipped VCF per contig.
# When filter_parents is true, each per-contig VCF is restricted to the
# parental samples (used to build the parental graph); MT keeps only the
# mother, Y keeps only the father, and every other contig keeps both parents.
task runSplitJointGenotypedVCF {
    input {
        String in_proband_sample_name   # Proband sample name (currently unused in the command)
        String in_maternal_sample_name  # Mother's sample name for -s filtering
        String in_paternal_sample_name  # Father's sample name for -s filtering
        File joint_genotyped_vcf        # Bgzipped multi-sample VCF to split
        File joint_genotyped_vcf_index  # Matching .tbi index
        Array[String]+ contigs          # Contigs to extract, one output VCF each
        Boolean filter_parents          # true: restrict output to parental samples
    }
    command <<<
        set -exu -o pipefail
        if [ ~{filter_parents} == true ]; then
            SAMPLE_FILTER_STRING="-s ~{in_maternal_sample_name},~{in_paternal_sample_name}"
        else
            SAMPLE_FILTER_STRING=""
        fi
        # Symlink inputs so bcftools finds the index next to the VCF.
        ln -s ~{joint_genotyped_vcf} input_vcf_file.vcf.gz
        ln -s ~{joint_genotyped_vcf_index} input_vcf_file.vcf.gz.tbi
        while read -r contig; do
            if [[ ~{filter_parents} == true && ${contig} == "MT" ]]; then
                # MT is maternally inherited: keep only the mother's sample.
                bcftools view -O z -r "${contig}" -s ~{in_maternal_sample_name} input_vcf_file.vcf.gz > "${contig}.vcf.gz"
            elif [[ ~{filter_parents} == true && ${contig} == "Y" ]]; then
                # Y is paternally inherited: keep only the father's sample.
                bcftools view -O z -r "${contig}" -s ~{in_paternal_sample_name} input_vcf_file.vcf.gz > "${contig}.vcf.gz"
            else
                bcftools view -O z -r "${contig}" ${SAMPLE_FILTER_STRING} input_vcf_file.vcf.gz > "${contig}.vcf.gz"
            fi
            echo "${contig}.vcf.gz" >> contig_vcf_list.txt
        done < "~{write_lines(contigs)}"
    >>>
    output {
        # Per-contig VCFs in the same order as the input contigs list.
        Array[File]+ contig_vcfs = read_lines("contig_vcf_list.txt")
    }
    runtime {
        memory: 50 + " GB"
        disks: "local-disk 100 SSD"
        docker: "quay.io/biocontainers/bcftools:1.20--h8b25389_0"
    }
}
# Phase one contig of the cohort VCF with WhatsHap, using pedigree
# information (--ped) and, when an archive is supplied, per-contig GRCh37
# genetic-map tables (--genmap). Output is a bgzipped phased VCF.
task runWhatsHapPhasing {
    input {
        String in_cohort_sample_name  # Cohort/proband name used to prefix the output file
        File joint_genotyped_vcf      # Joint-genotyped VCF restricted to in_contig
        File? in_maternal_bam
        File? in_maternal_bam_index
        File? in_paternal_bam
        File? in_paternal_bam_index
        File? in_proband_bam
        File? in_proband_bam_index
        File in_ped_file              # PED file describing the trio relationships
        File? in_genetic_map          # Optional tar archive containing genetic_map_GRCh37/ tables
        String in_contig              # Contig being phased; selects the --genmap table
        File in_reference_file        # Reference FASTA
        File in_reference_index_file  # .fai index of the reference FASTA
        File in_reference_dict_file   # .dict sequence dictionary of the reference
    }
    Boolean genetic_map_available = defined(in_genetic_map)
    command <<<
        set -exu -o pipefail
        # BUGFIX: only build --genmap options when a genetic-map archive was
        # actually provided. Previously the options were built unconditionally,
        # so runs without GEN_MAP_FILES pointed whatshap at non-existent
        # genetic_map_GRCh37/ files for autosomes and X.
        GENMAP_OPTION_STRING=""
        if [ ~{genetic_map_available} == true ]; then
            tar -xvf ~{in_genetic_map}
            if [[ ~{in_contig} == "Y" || ~{in_contig} == "MT" || ~{in_contig} == "ABOlocus" ]]; then
                # No recombination map exists for these contigs.
                GENMAP_OPTION_STRING=""
            elif [ ~{in_contig} == "X" ]; then
                GENMAP_OPTION_STRING="--genmap genetic_map_GRCh37/genetic_map_chrX_nonPAR_combined_b37.txt --chromosome X"
            else
                GENMAP_OPTION_STRING="--genmap genetic_map_GRCh37/genetic_map_chr~{in_contig}_combined_b37.txt --chromosome ~{in_contig}"
            fi
        fi
        whatshap phase \
            --reference ~{in_reference_file} \
            --indels \
            --ped ~{in_ped_file} \
            ${GENMAP_OPTION_STRING} \
            -o ~{in_cohort_sample_name}_cohort_~{in_contig}.phased.vcf \
            ~{joint_genotyped_vcf} ~{in_proband_bam} ~{in_maternal_bam} ~{in_paternal_bam} \
        && bgzip ~{in_cohort_sample_name}_cohort_~{in_contig}.phased.vcf
    >>>
    output {
        File phased_cohort_vcf = "~{in_cohort_sample_name}_cohort_~{in_contig}.phased.vcf.gz"
    }
    runtime {
        memory: 50 + " GB"
        disks: "local-disk 100 SSD"
        docker: "quay.io/biocontainers/whatshap@sha256:cf82de1173a35a0cb063469a602eff2e8999b4cfc0f0ee9cef0dbaedafa5ab6c"
    }
}