Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions bin/fetch-cache-version
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

Comment thread
joverlee521 marked this conversation as resolved.
# Print the Nextclade version and dataset version recorded in a cached,
# zstd-compressed Nextclade TSV on S3, as one compact JSON object on stdout.
#
# Usage: fetch-cache-version <s3-url>
#
# Strict mode (`set -euo pipefail`) is deliberately NOT enabled here:
# `head -n 2` stops reading early, which triggers SIGPIPE in the pipeline,
# and with strict mode the script would exit before it is done.

cache_url="${1:?An S3 URL is required as the first argument}"

# jq program: split the raw TSV row on tabs and map the two selected columns
# onto the JSON keys written to stdout.
version_filter='
split("\t")
| { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }'

# Ignore SIGPIPE for the rest of the script (see the note on `head` above).
trap '' SIGPIPE

# Stream the object, keep only the header plus the first data row, select the
# two version columns, drop the header, and convert the remaining row to JSON.
# Everything the pipeline writes to stderr is discarded.
(
  aws s3 cp "$cache_url" - \
    | zstd -T0 -dcq \
    | head -n 2 \
    | tsv-select -H -f 'nextclade_version,dataset_version' \
    | tail -n 1 \
    | jq --raw-input -c "$version_filter"
) 2> /dev/null
59 changes: 59 additions & 0 deletions bin/use-nextclade-cache
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Decide whether the cached Nextclade results hosted on S3 may be reused.
# Prints "true" or "false" on stdout; "[INFO]" diagnostics go to stderr.
set -euo pipefail

# Directory holding this script; helper scripts are invoked relative to it.
bin="$(dirname "$0")"
# Directory of vendored helper scripts (s3-object-exists is called from here).
vendored="$bin"/../vendored

# Report that the cache must not be used, then stop the script successfully.
# Arguments: $1 - reason, logged to stderr behind an "[INFO] " prefix
# Outputs:   "false" on stdout
reject-cache() {
  echo "[INFO] $1" >&2
  echo "false"
  exit 0
}

# Entry point: print "true" when the cached Nextclade output can be reused and
# "false" otherwise. The cache is rejected when a renew flag exists, or when
# the Nextclade version or the dataset version recorded in the cache differs
# from the current one.
#
# Arguments:
#   $1 - destination s3:// URL where the renew file is hosted
#   $2 - source s3:// URL where the fallback renew file is hosted
#   $3 - path to the Nextclade executable
#   $4 - path to a Nextclade dataset ZIP archive
#   $5 - optional Nextclade dataset reference wildcard (defaults to empty)
#
# The argument variables are intentionally not `local`: renew-flag-exists and
# get-cache-version-info read s3_dst, s3_src and reference.
main() {
  s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
  s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
  nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
  nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
  # Nextclade dataset reference wildcard
  reference="${5:-}"

  # A renew flag forces a full re-run regardless of versions.
  if renew-flag-exists; then
    reject-cache "Found renew flag"
  fi

  cache_versions="$(get-cache-version-info)"

  # Compare the Nextclade executable version against the cached one.
  cache_nextclade_version="$(echo "$cache_versions" | jq -r .nextclade_version)"
  current_nextclade_version="$("$nextclade" --version)"
  if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
    reject-cache "Current Nextclade version ($current_nextclade_version) is different from cache version ($cache_nextclade_version)"
  fi

  # Compare the dataset tag stored in the ZIP's pathogen.json against the
  # cached one.
  cache_dataset_version="$(echo "$cache_versions" | jq -r .nextclade_dataset_version)"
  current_dataset_version="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
  if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
    reject-cache "Current Nextclade dataset version ($current_dataset_version) is different from cache version ($cache_dataset_version)"
  fi

  echo "true"
}

# Succeed when a ".renew" flag object for the current reference exists in
# either bucket. The destination bucket is checked first; the source bucket is
# only queried when the destination check fails.
# Globals: reference, s3_dst, s3_src, vendored (all read)
# Returns: 0 if a flag was found, otherwise the status of the source check
renew-flag-exists() {
  local flag_name="nextclade$reference.tsv.zst.renew"
  local dst_flag="$s3_dst/$flag_name"
  local src_flag="$s3_src/$flag_name"

  if "$vendored"/s3-object-exists "$dst_flag"; then
    return 0
  fi

  "$vendored"/s3-object-exists "$src_flag"
}

# Print the version info of the cached Nextclade TSV for the current
# reference, i.e. whatever bin/fetch-cache-version writes to stdout.
# The destination bucket is tried first; the source bucket is the fallback.
# Globals: reference, s3_dst, s3_src, bin (all read)
# Returns: the status of the last fetch-cache-version call that ran
#
# NOTE(review): the fallback only runs when the first call exits non-zero.
# fetch-cache-version does not enable pipefail, so its status presumably
# reflects only the final jq stage -- confirm that a missing destination
# object actually makes it fail.
get-cache-version-info() {
  # TODO: Update to check a separate file for version info.
  # Currently just checks the first row of the nextclade.tsv file.
  local version_file="nextclade${reference}.tsv.zst"
  local dst_version_file="$s3_dst/$version_file"
  local src_version_file="$s3_src/$version_file"

  # Both attempts must call the same helper; the fallback previously invoked
  # "$bin"/cache-version, a name that differs from the helper used first.
  "$bin"/fetch-cache-version "$dst_version_file" \
    || "$bin"/fetch-cache-version "$src_version_file"
}

main "$@"
84 changes: 58 additions & 26 deletions workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,52 +62,83 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned


# Run ./bin/use-nextclade-cache for one reference and store its stdout
# ("true" or "false") in a text file that the S3 download rules read to decide
# whether to fetch the cached Nextclade outputs.
rule use_nextclade_cache:
    input:
        nextclade="data/nextclade",
        # The reference wildcard uses "_" where the dataset name uses "-".
        nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
    params:
        dst_source=config["s3_dst"],
        src_source=config["s3_src"],
    output:
        use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
    shell:
        # Every interpolated value, including the redirection target, is
        # shell-quoted with `:q`.
        """
        ./bin/use-nextclade-cache \
            {params.dst_source:q} \
            {params.src_source:q} \
            {input.nextclade:q} \
            {input.nextclade_dataset:q} \
            {wildcards.reference:q} \
            > {output.use_nextclade_cache:q}
        """


rule download_nextclade_tsv_from_s3:
"""
If there's a .renew touchfile, do not use the cache
"""
input:
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
params:
dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
touch {output.nextclade}
use_nextclade_cache=$(cat {input.use_nextclade_cache})

if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
else
echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
touch {output.nextclade}
fi
"""

rule download_previous_alignment_from_s3:
## NOTE two potential bugs with this implementation:
## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
input:
use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
params:
dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
touch {output.alignment}
"""
use_nextclade_cache=$(cat {input.use_nextclade_cache})

if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
else
echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
touch {output.alignment}
fi
"""

rule get_sequences_without_nextclade_annotations:
"""Find sequences in FASTA which don't have clades assigned yet"""
Expand Down Expand Up @@ -135,40 +166,40 @@ rule get_sequences_without_nextclade_annotations:
rule download_nextclade_executable:
"""Download Nextclade"""
output:
nextclade="nextclade",
nextclade="data/nextclade",
benchmark:
f"benchmarks/download_nextclade_executable_{database}.txt"
shell:
"""
if [ "$(uname)" = "Darwin" ]; then
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o "nextclade"
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o {output.nextclade:q}

else
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o "nextclade"
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o {output.nextclade:q}
fi
chmod +x nextclade
chmod +x {output.nextclade:q}

if ! command -v ./nextclade &>/dev/null; then
if ! command -v {output.nextclade:q} &>/dev/null; then
echo "[ERROR] Nextclade executable not found"
exit 1
fi

NEXTCLADE_VERSION="$(./nextclade --version)"
NEXTCLADE_VERSION="$({output.nextclade:q} --version)"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
"""


rule download_nextclade_dataset:
"""Download Nextclade dataset"""
input:
"nextclade",
nextclade="data/nextclade",
output:
dataset="data/nextclade_data/{dataset_name}.zip",
benchmark:
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
shell:
"""
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
{input.nextclade:q} dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
"""


Expand All @@ -179,7 +210,7 @@ rule run_wuhan_nextclade:
metrics which will ultimately end up in metadata.tsv.
"""
input:
nextclade_path="nextclade",
nextclade_path="data/nextclade",
dataset="data/nextclade_data/sars-cov-2.zip",
sequences=f"data/{database}/nextclade.sequences.fasta",
params:
Expand Down Expand Up @@ -214,7 +245,7 @@ rule run_21L_nextclade:
Like wuhan nextclade, but TSV only, no alignments output
"""
input:
nextclade_path="nextclade",
nextclade_path="data/nextclade",
dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
output:
Expand All @@ -235,6 +266,7 @@ rule run_21L_nextclade:

rule nextclade_tsv_concat_versions:
input:
nextclade="data/nextclade",
tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
output:
Expand All @@ -245,7 +277,7 @@ rule nextclade_tsv_concat_versions:
"""
if [ -s {input.tsv} ]; then
# Get version numbers
nextclade_version="$(./nextclade --version)"
nextclade_version="$({input.nextclade:q} --version)"
dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"

Expand Down