Skip to content

Commit f9bca07

Browse files
authored
Merge pull request #466 from nextstrain/ignore-cache
Ignore cache if Nextclade or dataset version is different
2 parents 48e2cec + 9a2ca57 commit f9bca07

File tree

3 files changed

+137
-26
lines changed

3 files changed

+137
-26
lines changed

bin/fetch-cache-version

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
#
# Print the Nextclade and dataset versions recorded in a zstd-compressed
# Nextclade TSV stored on S3, as one compact JSON object with the keys
# "nextclade_version" and "nextclade_dataset_version".
#
# Usage: fetch-cache-version <s3-url>
#
# Strict mode (`set -euo pipefail`) is intentionally NOT enabled here:
# `head -n 2` stops reading early, which triggers SIGPIPE and, with strict
# mode on, makes the script exit before it is done.
#
# Without `pipefail`, the exit status is that of the final `jq` stage.

cache_url="${1:?An S3 URL is required as the first argument}"

# Ignore SIGPIPE so the early exit of `head` does not kill the script.
trap '' PIPE

# Only the header line and the first data row are read. The two version
# columns of that row are selected and turned into JSON. Diagnostics from
# every stage are discarded on purpose.
{
    aws s3 cp "$cache_url" - |
        zstd -T0 -dcq |
        head -n 2 |
        tsv-select -H -f 'nextclade_version,dataset_version' |
        tail -n 1 |
        jq --raw-input -c '
            split("\t")
            | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }'
} 2> /dev/null

bin/use-nextclade-cache

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
# Strict mode: stop on failing commands, unset variables and failing
# pipeline stages.
set -o errexit -o nounset -o pipefail

# Directory holding this script, and the vendored helper scripts next to it.
bin="$(dirname "$0")"
vendored="${bin}/../vendored"
6+
7+
# Print the reason for not using the cache ($1) on stderr, answer "false" on
# stdout and end the script successfully.
reject-cache() {
    echo "$1" >&2
    echo "false"
    exit 0
}

# Decide whether the cached Nextclade results may be reused.
#
# Prints exactly one word on stdout: "true" when the cache can be used,
# "false" otherwise. The reason for a "false" is logged on stderr.
#
# Arguments:
#   $1 - destination s3:// URL where the renew file is hosted
#   $2 - source s3:// URL where the fallback renew file is hosted
#   $3 - path to the Nextclade executable
#   $4 - path to a Nextclade dataset ZIP archive
#   $5 - (optional) Nextclade dataset reference wildcard
#
# s3_dst, s3_src and reference are read by the helper functions of this
# script, so they are not declared local.
main() {
    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
    # Nextclade dataset reference wildcard
    reference="${5:-}"

    # An explicit renew flag always disables the cache.
    if renew-flag-exists; then
        reject-cache "[INFO] Found renew flag"
    fi

    cached_info="$(get-cache-version-info)"

    # The cache must have been produced by the same Nextclade version ...
    cached_nextclade="$(jq -r .nextclade_version <<<"$cached_info")"
    running_nextclade="$("$nextclade" --version)"
    if [[ "$cached_nextclade" != "$running_nextclade" ]]; then
        reject-cache "[INFO] Current Nextclade version ($running_nextclade) is different from cache version ($cached_nextclade)"
    fi

    # ... and with the same dataset version (the tag in the ZIP's pathogen.json).
    cached_dataset="$(jq -r .nextclade_dataset_version <<<"$cached_info")"
    running_dataset="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
    if [[ "$cached_dataset" != "$running_dataset" ]]; then
        reject-cache "[INFO] Current Nextclade dataset version ($running_dataset) is different from cache version ($cached_dataset)"
    fi

    echo "true"
}
40+
41+
# Check for the ".renew" flag file of the current reference.
#
# Succeeds when the vendored `s3-object-exists` helper succeeds for the flag
# under the destination URL or, failing that, under the source URL.
# Reads the globals: reference, s3_dst, s3_src, vendored.
renew-flag-exists() {
    local flag_name="nextclade${reference}.tsv.zst.renew"

    # The destination is checked first. The source is only consulted when
    # the destination check fails.
    if "$vendored"/s3-object-exists "${s3_dst}/${flag_name}"; then
        return 0
    fi
    "$vendored"/s3-object-exists "${s3_src}/${flag_name}"
}
48+
49+
# Print the version info of the cached Nextclade TSV for the current
# reference, as produced by the `fetch-cache-version` script.
#
# The destination URL is tried first. The source URL is used as a fallback
# when the destination lookup exits with a non-zero status.
# Reads the globals: reference, s3_dst, s3_src, bin.
get-cache-version-info() {
    # TODO: Read the version info from a separate file.
    # Currently this just checks the first row of the nextclade.tsv file.
    local version_file="nextclade${reference}.tsv.zst"
    local dst_version_file="$s3_dst/$version_file"
    local src_version_file="$s3_src/$version_file"

    # Both lookups go through the same helper script. The fallback used to
    # call a non-existent `cache-version`, so it could never succeed.
    "$bin"/fetch-cache-version "$dst_version_file" \
        || "$bin"/fetch-cache-version "$src_version_file"
}
58+
59+
main "$@"

workflow/snakemake_rules/nextclade.smk

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -62,52 +62,83 @@ if config.get("s3_dst") and config.get("s3_src"):
6262
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
6363
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned
6464

65+
66+
rule use_nextclade_cache:
    """
    Record whether the cached Nextclade results may be reused.

    Runs ./bin/use-nextclade-cache and stores what it prints on stdout
    ("true" or "false") in the output file, which the rules that download
    the cache from S3 read before deciding to download.
    """
    input:
        nextclade="data/nextclade",
        nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
    params:
        dst_source=config["s3_dst"],
        src_source=config["s3_src"],
    output:
        use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
    shell:
        # Every interpolated value is shell-quoted with `:q`, including the
        # redirect target, so unusual characters cannot split the command.
        """
        ./bin/use-nextclade-cache \
            {params.dst_source:q} \
            {params.src_source:q} \
            {input.nextclade:q} \
            {input.nextclade_dataset:q} \
            {wildcards.reference:q} \
            > {output.use_nextclade_cache:q}
        """
85+
86+
6587
rule download_nextclade_tsv_from_s3:
6688
"""
6789
If there's a .renew touchfile, do not use the cache
6890
"""
91+
input:
92+
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
6993
params:
7094
dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
7195
src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
72-
dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
73-
src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
7496
lines=config.get("subsample", {}).get("nextclade", 0),
7597
output:
7698
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
7799
benchmark:
78100
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
79101
shell:
80102
"""
81-
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
82-
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
83-
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
84-
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
85-
touch {output.nextclade}
103+
use_nextclade_cache=$(cat {input.use_nextclade_cache})
104+
105+
if [[ "$use_nextclade_cache" == 'true' ]]; then
106+
echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
107+
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
108+
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
109+
else
110+
echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
111+
touch {output.nextclade}
112+
fi
86113
"""
87114

88115
rule download_previous_alignment_from_s3:
89116
## NOTE two potential bugs with this implementation:
90117
## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule runs
91118
## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
119+
input:
120+
use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
92121
params:
93122
dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
94123
src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
95-
dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
96-
src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
97124
lines=config.get("subsample", {}).get("nextclade", 0),
98125
output:
99126
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
100127
benchmark:
101128
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
102129
shell:
103130
"""
104-
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
105-
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
106-
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
107-
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
108-
touch {output.alignment}
109-
"""
131+
use_nextclade_cache=$(cat {input.use_nextclade_cache})
110132
133+
if [[ "$use_nextclade_cache" == 'true' ]]; then
134+
echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
135+
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
136+
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
137+
else
138+
echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
139+
touch {output.alignment}
140+
fi
141+
"""
111142

112143
rule get_sequences_without_nextclade_annotations:
113144
"""Find sequences in FASTA which don't have clades assigned yet"""
@@ -135,40 +166,40 @@ rule get_sequences_without_nextclade_annotations:
135166
rule download_nextclade_executable:
136167
"""Download Nextclade"""
137168
output:
138-
nextclade="nextclade",
169+
nextclade="data/nextclade",
139170
benchmark:
140171
f"benchmarks/download_nextclade_executable_{database}.txt"
141172
shell:
142173
"""
143174
if [ "$(uname)" = "Darwin" ]; then
144-
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o "nextclade"
175+
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o {output.nextclade:q}
145176
146177
else
147-
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o "nextclade"
178+
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o {output.nextclade:q}
148179
fi
149-
chmod +x nextclade
180+
chmod +x {output.nextclade:q}
150181
151-
if ! command -v ./nextclade &>/dev/null; then
182+
if ! command -v {output.nextclade:q} &>/dev/null; then
152183
echo "[ERROR] Nextclade executable not found"
153184
exit 1
154185
fi
155186
156-
NEXTCLADE_VERSION="$(./nextclade --version)"
187+
NEXTCLADE_VERSION="$({output.nextclade:q} --version)"
157188
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
158189
"""
159190

160191

161192
rule download_nextclade_dataset:
162193
"""Download Nextclade dataset"""
163194
input:
164-
"nextclade",
195+
nextclade="data/nextclade",
165196
output:
166197
dataset="data/nextclade_data/{dataset_name}.zip",
167198
benchmark:
168199
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
169200
shell:
170201
"""
171-
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
202+
{input.nextclade:q} dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
172203
"""
173204

174205

@@ -179,7 +210,7 @@ rule run_wuhan_nextclade:
179210
metrics which will ultimately end up in metadata.tsv.
180211
"""
181212
input:
182-
nextclade_path="nextclade",
213+
nextclade_path="data/nextclade",
183214
dataset="data/nextclade_data/sars-cov-2.zip",
184215
sequences=f"data/{database}/nextclade.sequences.fasta",
185216
params:
@@ -214,7 +245,7 @@ rule run_21L_nextclade:
214245
Like wuhan nextclade, but TSV only, no alignments output
215246
"""
216247
input:
217-
nextclade_path="nextclade",
248+
nextclade_path="data/nextclade",
218249
dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
219250
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
220251
output:
@@ -235,6 +266,7 @@ rule run_21L_nextclade:
235266

236267
rule nextclade_tsv_concat_versions:
237268
input:
269+
nextclade="data/nextclade",
238270
tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
239271
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
240272
output:
@@ -245,7 +277,7 @@ rule nextclade_tsv_concat_versions:
245277
"""
246278
if [ -s {input.tsv} ]; then
247279
# Get version numbers
248-
nextclade_version="$(./nextclade --version)"
280+
nextclade_version="$({input.nextclade:q} --version)"
249281
dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
250282
timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
251283

0 commit comments

Comments
 (0)