sars-cov-2-eu-bdmm-prime/config.yaml at main · cecivale/sars-cov-2-eu-bdmm-prime · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Modified Nextrain workflow from 2020-09-11 by Cecilia Valenzuela
# MIT License
# Copyright (c) 2020 Nextstrain, full text https://github.com/nextstrain/ncov/blob/master/LICENSE


# This file contains defaults for the "config" object used in the Snakefile.
# To temporarily override or provide a value, you can use snakemake's --config
# or --configfile options.
---

# conda environment file to use by default
conda_environment: "/envs/nextstrain.yaml"

# These are the two main starting files for the run
# Files are previously downloaded from GISAID database, Downloads > nextmeta and nextfasta to the data folder
sequences: "data/201113_sequences.fasta"
metadata: "data/201113_metadata.tsv"

# Builds to run: subsampling + beast analysis
builds:
  europeC:
    subsampling_scheme: ssprop_China
    beast_analysis:
      - 210205_europe10

# Define files used for external configuration. Common examples consist of a
# list of strains to include and exclude from analyses, a reference sequence to
# align sequences to.
files:
  include: "files/include.txt"
  exclude: "files/exclude.txt"
  reference: "files/reference_seq.gb"
  demes: "files/demes.csv"

# Filter settings
filter:
  # Require nearly full-length genomes.
  min_length: 27000

  # Omit sequences with incomplete date annotations, and USA seqs without a state.
  exclude_where: "division='USA' date='2020' date='2020-01-XX' date='2020-02-XX' date='2020-03-XX' date='2020-04-XX' date='2020-05-XX' date='2020-06-XX' date='2020-07-XX' date='2020-08-XX' date='2020-09-XX' date='2020-10-XX' date='2020-11-XX' date='2020-12-XX' date='2020-01' date='2020-02' date='2020-03' date='2020-04' date='2020-05' date='2020-06' date='2020-07' date='2020-08' date='2020-09' date='2020-10' date='2020-11' date='2020-12'"

  # Exclude sequences which are from before late 2019 (likely date mix-ups)
  min_date: 2019.74

  # Exclude sequences that are after this date
  max_date: 2020.183 # 2020-03-08

# Alignment settings
# Alignments are partitioned into smaller groups to speed up the overall alignment process.
# The number of sequences per group determines the run time of a single alignment job.
partition_sequences:
  sequences_per_group: 150

# Mask settings determine how the multiple sequence alignment is masked prior to phylogenetic inference.
mask:
  # Number of bases to mask from the beginning and end of the alignment. These regions of the genome
  # are difficult to sequence accurately.
  mask_from_beginning: 100
  mask_from_end: 50

  # Specific sites to mask in the reference genome's coordinates.
  # These are 1-indexed coordinates of sites that have been identified as prone to sequencing errors.
  # 13402, 24389 and 24390 are restricted to Belgian samples
  mask_sites: "13402 24389 24390"


# Subsampling schemes
subsampling:
  ssprop_China:
    France:
      country: "France"
      region: "Europe"
      min_date: "2020-01-23"
      max_date: "2020-03-08"
      seq_per_deme: 60
      prob: "cases"
    Germany:
      country: "Germany"
      region: "Europe"
      min_date: "2020-01-28"
      max_date: "2020-03-08"
      seq_per_deme: 50
      prob: "cases"
    China:
      country: "China"
      region: "Asia"
      min_date: "2019-12-24"
      max_date: "2020-01-23"
      seq_per_deme: 60
      prob: "deaths"
    Italy:
      country: "Italy"
      region: "Europe"
      min_date: "2020-01-29"
      max_date: "2020-03-08"
      seq_per_deme: 60
      prob: "cases"
    OtherEuropean:
      region: "Europe"
      exclude_country: ["France","Germany","Italy","Spain"]
      min_date: "2020-01-29"
      max_date: "2020-03-08"
      seq_per_deme: 50
      prob: "cases"
    Spain:
      country: "Spain"
      region: "Europe"
      min_date: "2020-02-24"
      max_date: "2020-03-08"
      seq_per_deme: 60
      prob: "cases"

tree:
  tree-builder-args: "'-ninit 10 -n 4'"

# TreeTime settings
refine:
  root: "EPI_ISL_402123/China/2019-12-24"
  clock_rate: 0.0008
  clock_std_dev: 0.0004
  coalescent: "skyline"
  date_inference: "marginal"
  divergence_unit: "mutations"
  clock_filter_iqd: 4


# BEAST2 analysis settings
beast:
  # BEAST2 jar file path
  #jar: "/cluster/home/ceciliav/BDMM-Prime.jar"
  jar: "/Users/maceci/code/mt-beast-dev/BDMM-Prime/out/artifacts/BDMM_Prime_jar/BDMM-Prime.jar"
  cpus_mcmc: 2
  # List of parallel mcmc chains seeds to run
  n_mcmc: [1,2,3,4,5,6,7,8,9,10]
  # Length of a mcmc chain, it has to be the same length than the one specified in the xml file
  l_mcmc: 20000000
  # Time for mcmc chains in cluster in minutes
  t_mcmc: 7200
  t_traj: 720
  # Burnin percentage
  burnin: 10
  # Minimum ESS value to combine chains
  min_ess: 50
  mcc_heights: "mean"
  # Trajectories
  n_traj: 1000
  resample: 100000
  n_particles: [300, 1000, 3000, 10000]