Skip to content

Rename use_fp32_master_weights_fused to use_fp32_master_weights and r… #343

Rename use_fp32_master_weights_fused to use_fp32_master_weights and r…

Rename use_fp32_master_weights_fused to use_fp32_master_weights and r… #343

# Workflow-level settings: display name, triggers, default shell, concurrency.
name: 'BioNeMo MBridge Recipes CI'

on:
  # Pushes to mirrored PR branches and to dependabot branches.
  push:
    branches:
      - 'pull-request/[0-9]+'
      - 'dependabot/**'
  # Merge-queue check requests.
  merge_group:
    types:
      - checks_requested
  schedule:
    # Daily at 09:00 UTC (02:00 MST).
    - cron: '0 9 * * *'

# Every `run:` step that does not set its own `shell:` traces commands (-x) and
# aborts on errors (-e), unset variables (-u) and failed pipeline stages.
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# Runs are grouped by workflow name plus the PR number or, when the event
# carries none, the git ref; a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
# Decides which *megatron recipe directories must be tested for this event and
# publishes them (with their container image) plus the PR labels as job outputs.
changed-dirs:
  runs-on: ubuntu-latest
  outputs:
    any_changed: ${{ steps.changed-files.outputs.any_changed }}
    all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
    dirs: ${{ steps.set-dirs.outputs.dirs }}
    labels: ${{ steps.set-dirs.outputs.labels }}
  steps:
    # Only runs on pull-request/<N> branches; on other refs the step is skipped
    # and its pr-info output is empty.
    - id: get-pr-info
      if: ${{ startsWith(github.ref_name, 'pull-request/') }}
      uses: nv-gha-runners/get-pr-info@main
    # Full history is fetched so that `git merge-base` below can resolve.
    - uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Get merge-base commit
      id: merge-base
      run: |
        # Get the merge-base between current branch and main
        MERGE_BASE=$(git merge-base HEAD origin/main)
        echo "merge-base=$MERGE_BASE" >> "$GITHUB_OUTPUT"
        echo "Merge-base commit: $MERGE_BASE"
    - name: Get changed files
      id: changed-files
      uses: step-security/changed-files@v46
      with:
        json: true
        matrix: true
        base_sha: ${{ steps.merge-base.outputs.merge-base }}
        dir_names: true
        dir_names_max_depth: 3
        files: |
          bionemo-recipes/recipes/*megatron/**
          sub-packages/bionemo-recipeutils/**
          sub-packages/bionemo-core/**
    - id: set-dirs
      name: Determine which directories to run
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_INFO: ${{ steps.get-pr-info.outputs.pr-info }}
        CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
      run: |
        # Get all *megatron recipe directories as a JSON array without trailing
        # slashes. `|| true` keeps a non-matching glob from aborting the step
        # under `-e -o pipefail`; the result is then simply `[]`.
        ALL_DIRS=$( { ls -d bionemo-recipes/recipes/*megatron/ 2>/dev/null || true; } | jq -R -s -c 'split("\n")[:-1] | map(rtrimstr("/"))')
        # Helper to check for a PR label. The label name is handed to jq as a
        # variable (--arg) instead of being spliced into the filter text.
        has_label() {
          [[ "$PR_INFO" != "null" && "$PR_INFO" != "" ]] && \
            echo "$PR_INFO" | jq -e --arg label "$1" '.labels[]? | select(.name == $label)' > /dev/null 2>&1
        }
        # --- Shared sub-package dependency handling ---
        # MBridge recipes depend on shared sub-packages (bionemo-core, bionemo-recipeutils)
        # installed from git in pyproject.toml. When those sub-packages change, we need to
        # test all megatron recipes against the local version. Each recipe's .ci_build.sh
        # handles reinstalling from the local checkout if present.
        #
        # To add a new megatron recipe that depends on shared sub-packages:
        #   1. Add it as a directory under bionemo-recipes/recipes/ with a *megatron suffix
        #   2. Ensure its .ci_build.sh includes the local sub-package override
        #      (see eden_megatron/.ci_build.sh for the pattern)
        # Determine which directories to run
        if [[ "$EVENT_NAME" == "schedule" ]]; then
          echo "Scheduled run - running all megatron recipes"
          DIRS="$ALL_DIRS"
        elif has_label "ciflow:skip"; then
          echo "Found 'ciflow:skip' label - skipping all recipe tests"
          DIRS="[]"
        elif has_label "ciflow:all-recipes"; then
          echo "Found 'ciflow:all-recipes' label - running all megatron recipes"
          DIRS="$ALL_DIRS"
        else
          # Start with megatron recipe directories that have direct changes
          DIRS=$(echo "$ALL_DIRS" | jq -c --argjson changed "$CHANGED_FILES" '
            map(select(. as $dir | $changed | index($dir) != null))
          ')
          # If a shared sub-package changed, run ALL megatron recipes
          SHARED_DEP_CHANGED=$(echo "$CHANGED_FILES" | jq 'map(select(startswith("sub-packages/bionemo-recipeutils") or startswith("sub-packages/bionemo-core"))) | length > 0')
          if [[ "$SHARED_DEP_CHANGED" == "true" ]]; then
            echo "Shared sub-package changed - running all megatron recipes"
            DIRS="$ALL_DIRS"
          fi
        fi
        # Assign Docker images to the selected directories
        DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
          map({
            dir: .,
            name: (. | sub("^bionemo-recipes/"; "")),
            image: "svcbionemo023/bionemo-framework:pytorch26.04-py3-squashed"
          })
        ')
        echo "dirs=$DIRS_WITH_IMAGES" >> "$GITHUB_OUTPUT"
        # Emit PR labels as a JSON array so downstream jobs can gate on ciflow:* labels.
        if [[ "$PR_INFO" != "null" && "$PR_INFO" != "" ]]; then
          LABELS=$(echo "$PR_INFO" | jq -c '[.labels[]?.name]' 2>/dev/null || echo "[]")
        else
          LABELS="[]"
        fi
        echo "labels=$LABELS" >> "$GITHUB_OUTPUT"
    - name: Show output
      # Values reach the script through the environment rather than being
      # expanded into its text: changed paths and the ref name come from the
      # pushed branch, and a single quote in one of them would otherwise end
      # the shell string early and let the remainder run as commands.
      env:
        REF_NAME: ${{ github.ref_name }}
        MERGE_BASE: ${{ steps.merge-base.outputs.merge-base }}
        CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        CHANGED_FILES_OUTPUTS: ${{ toJSON(steps.changed-files.outputs) }}
        SET_DIRS_OUTPUTS: ${{ toJSON(steps.set-dirs.outputs) }}
      run: |
        echo "=== Changed Files Analysis ==="
        echo "Current branch: $REF_NAME"
        echo "Merge-base commit: $MERGE_BASE"
        echo "Changed files compared to merge-base:"
        printf '%s\n' "$CHANGED_FILES" | jq -r '.[]' | sed 's/^/ - /'
        echo "Total changed files: $(printf '%s\n' "$CHANGED_FILES" | jq '. | length')"
        printf '%s\n' "$CHANGED_FILES_OUTPUTS"
        printf '%s\n' "$SET_DIRS_OUTPUTS"
      shell: bash
# Runs pytest for every recipe selected by changed-dirs, one matrix entry per
# recipe directory, inside the container image assigned to that entry.
unit-tests:
  needs: changed-dirs
  runs-on: linux-amd64-gpu-l4-latest-1
  # Skipped entirely when changed-dirs selected no directories.
  if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
  name: "mbridge-unit-tests (${{ matrix.recipe.name }})"
  container:
    image: ${{ matrix.recipe.image }}
    options: --shm-size=16G
  env:
    CI: true
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    HF_HOME: /cache/huggingface
  strategy:
    matrix:
      # Each entry is an object with `dir`, `name` and `image` keys.
      recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
    # One recipe failing does not cancel the other matrix entries.
    fail-fast: false
  steps:
    - name: Show GPU info
      run: nvidia-smi
    - name: Setup proxy cache
      uses: nv-gha-runners/setup-proxy-cache@main
    # Only the recipe under test and the two shared sub-packages are checked out.
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        sparse-checkout: |
          ${{ matrix.recipe.dir }}
          sub-packages/bionemo-recipeutils
          sub-packages/bionemo-core
        sparse-checkout-cone-mode: false
    - name: Cache Hugging Face models
      uses: actions/cache@v4
      with:
        path: /cache/huggingface
        key: ${{ runner.os }}-huggingface-${{ matrix.recipe.name }}-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-huggingface-${{ matrix.recipe.name }}-
          ${{ runner.os }}-huggingface-
    - name: Install dependencies
      working-directory: ${{ matrix.recipe.dir }}
      env:
        # The directory name originates from the checked-out tree, so it is
        # passed as data through the environment instead of being expanded
        # into the script text, where shell metacharacters would be executed.
        RECIPE_DIR: ${{ matrix.recipe.dir }}
      run: |
        # Preference order: recipe-provided build script, editable install,
        # requirements file. Anything else is treated as a broken recipe.
        if [ -f .ci_build.sh ]; then
          bash .ci_build.sh
        elif [ -f pyproject.toml ] || [ -f setup.py ]; then
          PIP_CONSTRAINT= pip install -e .
          echo "Installed $RECIPE_DIR as editable package"
        elif [ -f requirements.txt ]; then
          PIP_CONSTRAINT= pip install -r requirements.txt
          echo "Installed $RECIPE_DIR from requirements.txt"
        else
          echo "No pyproject.toml, setup.py, or requirements.txt found in $RECIPE_DIR"
          exit 1
        fi
    - name: Run tests
      working-directory: ${{ matrix.recipe.dir }}
      run: |
        # Optional per-recipe environment setup for the test run.
        if [ -f .ci_test_env.sh ]; then
          source .ci_test_env.sh
        fi
        pytest -v .
# Executes the evo2_megatron example notebook through pytest + nbval.
run-tests-notebooks:
  needs: changed-dirs
  runs-on: linux-amd64-gpu-l4-latest-1
  # Same trigger pattern as the framework workflow's notebook job: on PRs only
  # when a label asks for it, automatically for merge_group and the nightly
  # schedule. Scoped to evo2_megatron for now, the only megatron recipe that
  # ships example notebooks, and only when that recipe was selected.
  if: |
    contains(needs.changed-dirs.outputs.dirs, 'bionemo-recipes/recipes/evo2_megatron') && (
      github.event_name == 'schedule' ||
      github.event_name == 'merge_group' ||
      contains(fromJSON(needs.changed-dirs.outputs.labels || '[]'), 'ciflow:all-recipes') ||
      contains(fromJSON(needs.changed-dirs.outputs.labels || '[]'), 'ciflow:notebooks')
    )
  name: 'mbridge-notebook-tests (evo2_megatron)'
  container:
    image: 'svcbionemo023/bionemo-framework:pytorch26.04-py3-squashed'
    options: '--shm-size=16G'
  env:
    CI: true
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    HF_HOME: '/cache/huggingface'
    BIONEMO_DATA_SOURCE: 'ngc'
  steps:
    - name: 'Show GPU info'
      run: nvidia-smi

    - name: 'Setup proxy cache'
      uses: nv-gha-runners/setup-proxy-cache@main

    # Sparse checkout: the recipe itself plus the two shared sub-packages.
    - name: 'Checkout repository'
      uses: actions/checkout@v4
      with:
        sparse-checkout-cone-mode: false
        sparse-checkout: |
          bionemo-recipes/recipes/evo2_megatron
          sub-packages/bionemo-recipeutils
          sub-packages/bionemo-core

    # Restore falls back from the notebook-specific prefix to the recipe
    # prefix and finally to any Hugging Face cache for this OS.
    - name: 'Cache Hugging Face models'
      uses: actions/cache@v4
      with:
        path: '/cache/huggingface'
        key: ${{ runner.os }}-huggingface-evo2_megatron-notebooks-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-huggingface-evo2_megatron-notebooks-
          ${{ runner.os }}-huggingface-evo2_megatron-
          ${{ runner.os }}-huggingface-

    - name: 'Install dependencies'
      working-directory: 'bionemo-recipes/recipes/evo2_megatron'
      run: |
        bash .ci_build.sh
        source .ci_test_env.sh
        pip install nbval

    - name: 'Run notebook tests'
      working-directory: 'bionemo-recipes/recipes/evo2_megatron'
      run: |
        source .ci_test_env.sh
        FAST_CI_MODE=1 pytest -v -s --nbval-lax -x -p no:python \
          examples/lora-fine-tuning-tutorial.ipynb
# Aggregation job: fails when any job it depends on failed or was cancelled,
# and passes when they all succeeded or were skipped.
verify-mbridge-recipe-tests:
  needs: [changed-dirs, unit-tests, run-tests-notebooks]
  runs-on: ubuntu-latest
  # Run even when an upstream job failed, was cancelled or was skipped.
  if: always()
  steps:
    - name: Check test job statuses
      env:
        # Evaluates to the string "true" or "false" before the script starts.
        ANY_FAILED_OR_CANCELLED: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
      run: |
        if [[ "$ANY_FAILED_OR_CANCELLED" != "true" ]]; then
          echo "All mbridge test jobs have completed successfully or were skipped!"
          exit 0
        fi
        echo "Some mbridge test jobs have failed or been cancelled!"
        exit 1