Skip to content

Commit 3367f56

Browse files
committed
Ignore cache if Nextclade or dataset version is different
Currently checks the Nextclade and dataset versions in the first row of the nextclade.tsv file and formats them as the proposed JSON. Once the version JSON file is in place, it should be easy to swap out the check for the new file.
1 parent a8b57d0 commit 3367f56

File tree

3 files changed

+70
-6
lines changed

3 files changed

+70
-6
lines changed

bin/fetch-cache-version

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
3+
s3_url="${1:?An S3 URL is required as the first argument}"
4+
5+
6+
trap '' SIGPIPE
7+
8+
(aws s3 cp "$s3_url" - \
9+
| zstd -T0 -dcq \
10+
| head -n 2 \
11+
| tsv-select -H -f 'nextclade_version,dataset_version' \
12+
| tail -n 1 \
13+
| jq --raw-input -c '
14+
split("\t")
15+
| { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
16+
2> /dev/null

bin/use-nextclade-cache

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,59 @@
11
#!/bin/bash
22
set -euo pipefail
33

4-
vendored="$(dirname "$0")"/../vendored
4+
bin="$(dirname "$0")"
5+
vendored="$bin"/../vendored
56

67
main() {
78
s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
89
s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
10+
nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
11+
nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
912
# Nextclade dataset reference wildcard
10-
reference="${3:-}"
13+
reference="${5:-}"
14+
1115
if renew-flag-exists; then
1216
echo "[INFO] Found renew flag" >&2
1317
echo "false"
1418
exit 0
1519
fi
1620

21+
cache_versions="$(get-cache-version-info)"
22+
cache_nextclade_version="$(echo "$cache_versions" | jq -r .nextclade_version)"
23+
current_nextclade_version="$("$nextclade" --version)"
24+
if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
25+
echo "[INFO] Current Nextclade version ($current_nextclade_version) is different from cache version ($cache_nextclade_version)" >&2
26+
echo "false"
27+
exit 0
28+
fi
29+
30+
cache_dataset_version="$(echo "$cache_versions" | jq -r .nextclade_dataset_version)"
31+
current_dataset_version="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
32+
if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
33+
echo "[INFO] Current Nextclade dataset version ($current_dataset_version) is different from cache version ($cache_dataset_version)" >&2
34+
echo "false"
35+
exit 0
36+
fi
37+
1738
echo "true"
1839
}
1940

2041
renew-flag-exists() {
21-
local renew_file="nextclade${reference}.tsv.zst.renew"
22-
local dst_renew_file="${s3_dst}/${renew_file}"
23-
local src_renew_file="${s3_src}/${renew_file}"
42+
local renew_file="nextclade$reference.tsv.zst.renew"
43+
local dst_renew_file="$s3_dst/$renew_file"
44+
local src_renew_file="$s3_src/$renew_file"
45+
46+
"$vendored"/s3-object-exists "$dst_renew_file" || "$vendored"/s3-object-exists "$src_renew_file"
47+
}
48+
49+
get-cache-version-info() {
50+
# TODO: Update to check a separate file for version info
51+
# Currently just checks the first row of the nextclade.tsv file
52+
local version_file="nextclade$reference.tsv.zst"
53+
local dst_version_file="$s3_dst/$version_file"
54+
local src_version_file="$s3_src/$version_file"
2455

25-
"$vendored"/s3-object-exists "${dst_renew_file}" || "$vendored"/s3-object-exists "${src_renew_file}"
56+
"$bin"/fetch-cache-version "$dst_version_file" || "$bin"/cache-version "$src_version_file"
2657
}
2758

2859
main "$@"

workflow/snakemake_rules/nextclade.smk

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,22 @@ if config.get("s3_dst") and config.get("s3_src"):
6262
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
6363
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned
6464

65+
def _convert_dataset_name(wildcards):
66+
if wildcards.reference == '':
67+
dataset_name="sars-cov-2"
68+
elif wildcards.reference == '_21L':
69+
dataset_name="sars-cov-2-21L"
70+
else:
71+
# We shouldn't run into this since we have wildcard_constraints,
72+
# but doesn't hurt to include it in case that changes
73+
raise ValueError(f"Cannot convert unsupported reference {wildcards.reference!r} to Nextclade dataset name")
74+
75+
return f"data/nextclade_data/{dataset_name}.zip",
76+
6577
rule use_nextclade_cache:
78+
input:
79+
nextclade="./nextclade",
80+
nextclade_dataset=_convert_dataset_name,
6681
params:
6782
dst_source=config["s3_dst"],
6883
src_source=config["s3_src"],
@@ -73,6 +88,8 @@ if config.get("s3_dst") and config.get("s3_src"):
7388
./bin/use-nextclade-cache \
7489
{params.dst_source:q} \
7590
{params.src_source:q} \
91+
{input.nextclade:q} \
92+
{input.nextclade_dataset:q} \
7693
{wildcards.reference:q} \
7794
> {output.use_nextclade_cache}
7895
"""

0 commit comments

Comments
 (0)