-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvcf.c
More file actions
106 lines (93 loc) · 3.71 KB
/
vcf.c
File metadata and controls
106 lines (93 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* File: treevcf.c
* Author: Richard Durbin (rd109@cam.ac.uk)
* Copyright (C) Richard Durbin, Cambridge University, 2019
*-------------------------------------------------------------------
* Description: based on pbwtHtslib.c
* Exported functions:
* HISTORY:
* Last edited: Aug 13 12:21 2020 (rd109)
* Created: Sun Nov 17 19:41:19 2019 (rd109)
*-------------------------------------------------------------------
*/
#include "vcf.h"
#include <htslib/synced_bcf_reader.h>
#include <htslib/faidx.h>
#include <ctype.h> /* for toupper() */
Vcf *vcfRead (char *filename, int *multi, int *nonSNP) /* read GTs from vcf/bcf using htslib */
{
int i, j, k ;
bcf_srs_t *sr = bcf_sr_init () ;
if (!bcf_sr_add_reader (sr, filename)) die ("failed to open vcf file", filename) ;
Vcf *v = new0 (1, Vcf) ;
v->samples = dictCreate (1024) ;
v->sites = arrayCreate (1024, VcfSite) ;
bcf_hdr_t *hr = sr->readers[0].header ;
int nSample = bcf_hdr_nsamples(hr) ;
for (i = 0 ; i < nSample ; ++i)
{ if (!dictAdd (v->samples, hr->samples[i], &k))
fprintf (stderr, "repeat sample name %s in vcf file\n", hr->samples[i]) ;
assert (k == i) ;
}
assert (nSample == dictMax(v->samples)) ;
int ngt_arr = 0, *gt_arr = NULL ;
*multi = 0, *nonSNP = 0 ;
BOOL isDiploid = FALSE ;
while (bcf_sr_next_line (sr))
{ bcf1_t *line = bcf_sr_get_line(sr,0) ;
if (!v->seqName)
v->seqName = strdup (bcf_seqname(hr,line)) ;
else if (strcmp (v->seqName, bcf_seqname(hr,line)))
die ("multiple sequence names in VCF file %s != %s", v->seqName, bcf_seqname(hr,line)) ;
if (line->n_allele == 1) continue ; // ignore lines with no alts
if (line->n_allele > 2) { ++*multi ; continue ; }
if (line->d.allele[0][1] || line->d.allele[1][1]) { ++*nonSNP ; continue ; }
// get GTs
int ngt = bcf_get_genotypes(hr, line, >_arr, &ngt_arr) ;
if (ngt <= 0) continue ; // -1 is used if GT is not in the FORMAT
if ((!isDiploid && ngt != nSample) || (isDiploid && ngt != 2*nSample))
{ if (!isDiploid && (ngt == 2*nSample) && !arrayMax(v->sites))
{ fprintf (stderr, "VCF file appears to be diploid - converting homs to haploid and hets to missing\n") ;
isDiploid = TRUE ;
}
else
die ("wrong number of genotypes %d != nsample %d at VCF %s %lld",
ngt, nSample, v->seqName, line->pos+1) ;
}
VcfSite *s = arrayp(v->sites,arrayMax(v->sites),VcfSite) ;
s->pos = line->pos + 1 ; // bcf coordinates are 0-based
s->ref = tolower(*line->d.allele[0]) ;
s->alt = tolower(*line->d.allele[1]) ;
s->gt = new (nSample+1, char) ; // zero-terminate so can use as a string in dict package
for (i = 0 ; i < nSample ; i++)
if (isDiploid)
{ if (gt_arr[2*i] == bcf_int32_vector_end || gt_arr[2*i+1] == bcf_int32_vector_end)
die ("unexpected end of genotype vector in VCF sample %d site %d", i, nSample, s->pos) ;
if (gt_arr[2*i] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing ||
bcf_gt_allele(gt_arr[2*i]) != bcf_gt_allele(gt_arr[2*i+1]))
s->gt[i] = 1 ;
else
s->gt[i] = 2 + bcf_gt_allele(gt_arr[2*i]) ; // convert from BCF binary to 2 or 3
}
else
{ if (gt_arr[i] == bcf_int32_vector_end)
die ("unexpected end of genotype vector in VCF") ;
if (gt_arr[i] == bcf_gt_missing)
s->gt[i] = 1 ;
else
s->gt[i] = 2 + bcf_gt_allele(gt_arr[i]) ; // convert from BCF binary to 2 or 3
}
s->gt[nSample] = 0 ;
}
bcf_sr_destroy (sr) ;
return v ;
}
void vcfDestroy (Vcf *v)
{
dictDestroy (v->samples) ;
int i ;
for (i = 0 ; i < arrayMax(v->sites) ; ++i) free (arrp(v->sites,i,VcfSite)->gt) ;
arrayDestroy (v->sites) ;
free (v->seqName) ;
free (v) ;
}
/******* end of file ********/