@@ -62,52 +62,83 @@ if config.get("s3_dst") and config.get("s3_src"):
6262 ruleorder : download_nextclade_tsv_from_s3 > create_empty_nextclade_info
6363 ruleorder : download_previous_alignment_from_s3 > create_empty_nextclade_aligned
6464
65+
66+ rule use_nextclade_cache :
67+ input :
68+ nextclade = "data/nextclade" ,
69+ nextclade_dataset = lambda w : f"data/nextclade_data/sars-cov-2{ w .reference .replace ('_' ,'-' )} .zip" ,
70+ params :
71+ dst_source = config ["s3_dst" ],
72+ src_source = config ["s3_src" ],
73+ output :
74+ use_nextclade_cache = f"data/{ database } /use_nextclade_cache{{reference}}.txt" ,
75+ shell :
76+ """
77+ ./bin/use-nextclade-cache \
78+ {params.dst_source:q} \
79+ {params.src_source:q} \
80+ {input.nextclade:q} \
81+ {input.nextclade_dataset:q} \
82+ {wildcards.reference:q} \
83+ > {output.use_nextclade_cache}
84+ """
85+
86+
6587 rule download_nextclade_tsv_from_s3 :
6688 """
6789 If there's a .renew touchfile, do not use the cache
6890 """
91+ input :
92+ use_nextclade_cache = f"data/{ database } /use_nextclade_cache{{reference}}.txt" ,
6993 params :
7094 dst_source = config ["s3_dst" ] + "/nextclade{reference}.tsv.zst" ,
7195 src_source = config ["s3_src" ] + "/nextclade{reference}.tsv.zst" ,
72- dst_rerun_touchfile = config ["s3_dst" ] + "/nextclade{reference}.tsv.zst.renew" ,
73- src_rerun_touchfile = config ["s3_dst" ] + "/nextclade{reference}.tsv.zst.renew" ,
7496 lines = config .get ("subsample" , {}).get ("nextclade" , 0 ),
7597 output :
7698 nextclade = f"data/{ database } /nextclade{{reference}}_old.tsv" ,
7799 benchmark :
78100 f"benchmarks/download_nextclade_tsv_from_s3_{ database } {{reference}}.txt"
79101 shell :
80102 """
81- ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
82- ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
83- ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
84- ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
85- touch {output.nextclade}
103+ use_nextclade_cache=$(cat {input.use_nextclade_cache})
104+
105+ if [[ "$use_nextclade_cache" == 'true' ]]; then
106+ echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
107+ ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
108+ ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
109+ else
110+ echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
111+ touch {output.nextclade}
112+ fi
86113 """
87114
88115 rule download_previous_alignment_from_s3 :
89116 ## NOTE two potential bugs with this implementation:
90117 ## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
91118 ## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
119+ input :
120+ use_nextclade_cache = f"data/{ database } /use_nextclade_cache.txt" ,
92121 params :
93122 dst_source = config ["s3_dst" ] + "/{seqtype}.fasta.zst" ,
94123 src_source = config ["s3_src" ] + "/{seqtype}.fasta.zst" ,
95- dst_rerun_touchfile = config ["s3_dst" ] + "/nextclade.tsv.zst.renew" ,
96- src_rerun_touchfile = config ["s3_dst" ] + "/nextclade.tsv.zst.renew" ,
97124 lines = config .get ("subsample" , {}).get ("nextclade" , 0 ),
98125 output :
99126 alignment = temp (f"data/{ database } /nextclade.{{seqtype}}.old.fasta" ),
100127 benchmark :
101128 f"benchmarks/download_previous_alignment_from_s3_{ database } {{seqtype}}.txt"
102129 shell :
103130 """
104- ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
105- ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
106- ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
107- ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
108- touch {output.alignment}
109- """
131+ use_nextclade_cache=$(cat {input.use_nextclade_cache})
110132
133+ if [[ "$use_nextclade_cache" == 'true' ]]; then
134+ echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
135+ ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
136+ ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
137+ else
138+ echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
139+ touch {output.alignment}
140+ fi
141+ """
111142
112143rule get_sequences_without_nextclade_annotations :
113144 """Find sequences in FASTA which don't have clades assigned yet"""
@@ -135,40 +166,40 @@ rule get_sequences_without_nextclade_annotations:
135166rule download_nextclade_executable :
136167 """Download Nextclade"""
137168 output :
138- nextclade = "nextclade" ,
169+ nextclade = "data/ nextclade" ,
139170 benchmark :
140171 f"benchmarks/download_nextclade_executable_{ database } .txt"
141172 shell :
142173 """
143174 if [ "$(uname)" = "Darwin" ]; then
144- curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o " nextclade"
175+ curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o {output. nextclade:q}
145176
146177 else
147- curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o " nextclade"
178+ curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o {output. nextclade:q}
148179 fi
149- chmod +x nextclade
180+ chmod +x {output. nextclade:q}
150181
151- if ! command -v ./ nextclade &>/dev/null; then
182+ if ! command -v {output. nextclade:q} &>/dev/null; then
152183 echo "[ERROR] Nextclade executable not found"
153184 exit 1
154185 fi
155186
156- NEXTCLADE_VERSION="$(./ nextclade --version)"
187+ NEXTCLADE_VERSION="$({output. nextclade:q} --version)"
157188 echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
158189 """
159190
160191
161192rule download_nextclade_dataset :
162193 """Download Nextclade dataset"""
163194 input :
164- " nextclade" ,
195+ nextclade = "data/ nextclade" ,
165196 output :
166197 dataset = "data/nextclade_data/{dataset_name}.zip" ,
167198 benchmark :
168199 f"benchmarks/download_nextclade_dataset_{ database } _{{dataset_name}}.txt"
169200 shell :
170201 """
171- ./ nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
202+ {input. nextclade:q} dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
172203 """
173204
174205
@@ -179,7 +210,7 @@ rule run_wuhan_nextclade:
179210 metrics which will ultimately end up in metadata.tsv.
180211 """
181212 input :
182- nextclade_path = "nextclade" ,
213+ nextclade_path = "data/ nextclade" ,
183214 dataset = "data/nextclade_data/sars-cov-2.zip" ,
184215 sequences = f"data/{ database } /nextclade.sequences.fasta" ,
185216 params :
@@ -214,7 +245,7 @@ rule run_21L_nextclade:
214245 Like wuhan nextclade, but TSV only, no alignments output
215246 """
216247 input :
217- nextclade_path = "nextclade" ,
248+ nextclade_path = "data/ nextclade" ,
218249 dataset = lambda w : f"data/nextclade_data/sars-cov-2-21L.zip" ,
219250 sequences = f"data/{ database } /nextclade_21L.sequences.fasta" ,
220251 output :
@@ -235,6 +266,7 @@ rule run_21L_nextclade:
235266
236267rule nextclade_tsv_concat_versions :
237268 input :
269+ nextclade = "data/nextclade" ,
238270 tsv = f"data/{ database } /nextclade{{reference}}_new_raw.tsv" ,
239271 dataset = lambda w : f"data/nextclade_data/sars-cov-2{ w .reference .replace ('_' ,'-' )} .zip" ,
240272 output :
@@ -245,7 +277,7 @@ rule nextclade_tsv_concat_versions:
245277 """
246278 if [ -s {input.tsv} ]; then
247279 # Get version numbers
248- nextclade_version="$(./ nextclade --version)"
280+ nextclade_version="$({input. nextclade:q} --version)"
249281 dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
250282 timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
251283
0 commit comments