
Commit eda67d8

Merge pull request #876 from ctastad/master
first commit - minerva hpc profile for icahn school of medicine at mo…
2 parents 1de0154 + 86bf6aa commit eda67d8

5 files changed

Lines changed: 294 additions & 0 deletions


.github/workflows/main.yml

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ jobs:
       - "mjolnir_globe"
       - "mpcdf"
       - "mpcdf_viper"
+      - "mssm"
       - "munin"
       - "nci_gadi"
       - "nu_genomics"

README.md

Lines changed: 1 addition & 0 deletions
@@ -174,6 +174,7 @@ Currently documentation is available for the following systems:
 - [MJOLNIR_GLOBE](docs/mjolnir_globe.md)
 - [MPCDF](docs/mpcdf.md)
 - [MPCDF_VIPER](docs/mpcdf_viper.md)
+- [MSSM](docs/mssm.md)
 - [MUNIN](docs/munin.md)
 - [NCI GADI](docs/nci_gadi.md)
 - [NU_GENOMICS](docs/nu_genomics.md)

conf/mssm.config

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Nextflow config for Minerva HPC at Icahn School of Medicine at Mount Sinai
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Author:      Christopher Tastad - Judy Cho Lab
    Contact:     christopher.tastad@mssm.edu
    HPC Support: hpchelp@hpc.mssm.edu

    IMPORTANT: Before running this pipeline, set the MINERVA_ALLOCATION environment
    variable in your master submission script:

        export MINERVA_ALLOCATION="acc_YOUR-PROJECT-NAME"

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// Global default params
params {
    config_profile_description = 'Minerva HPC at Icahn School of Medicine at Mount Sinai'
    config_profile_contact     = 'Christopher Tastad (@ctastad)'
    config_profile_url         = 'https://labs.icahn.mssm.edu/minervalab/'

    // Cluster-specific parameters
    minerva_allocation = System.getenv('MINERVA_ALLOCATION') ?: 'default_allocation'
    max_cpus   = 64
    max_memory = 1.5.TB
    max_time   = '336.h'
}

// Singularity environment configuration
env {
    SINGULARITY_CACHEDIR      = "/sc/arion/work/${System.getenv('USER')}/singularity_cache"
    SINGULARITY_TMPDIR        = "/sc/arion/work/${System.getenv('USER')}/singularity_tmp"
    SINGULARITY_LOCALCACHEDIR = "/sc/arion/work/${System.getenv('USER')}/singularity_cache"
    SINGULARITY_PULLFOLDER    = "/sc/arion/work/${System.getenv('USER')}/singularity_cache/pull"
    SINGULARITY_DISABLE_CACHE = "no"
}

singularity {
    enabled     = true
    autoMounts  = true
    cacheDir    = "/sc/arion/work/${System.getenv('USER')}/singularity_cache"
    pullTimeout = '120 min'

    // Pass proxy settings to container
    envWhitelist = ['http_proxy', 'https_proxy', 'all_proxy', 'no_proxy']
}

// LSF executor configuration
executor {
    name = 'lsf'
    submitRateLimit = '2 sec'
    // Specific LSF settings for proper memory handling
    perJobMemLimit = false
    perTaskReserve = true
}

// Process configuration
process {
    executor = 'lsf'
    resourceLimits = [
        cpus:   64,
        memory: 1.5.TB,
        time:   336.h
    ]

    // Dynamic queue selection based on job requirements
    queue = {
        if (task.time > 144.h) {
            return 'long'
        } else if (task.label && task.label.toString().contains('gpu') && task.time <= 30.min) {
            return 'gpuexpress'
        } else if (task.label && task.label.toString().contains('gpu')) {
            return 'gpu'
        } else if (task.time <= 12.h && task.cpus <= 8) {
            return 'express'
        } else {
            return 'premium'
        }
    }

    // Cluster options with proper memory handling
    clusterOptions = {
        def options = "-P ${System.getenv('MINERVA_ALLOCATION') ?: params.minerva_allocation}"

        // Handle memory requests - ensure consistency between -M and rusage
        if (task.memory) {
            def mem = task.memory.toMega()
            options += " -M ${mem}"
        }

        // Add GPU-specific options
        if (task.label && task.label.toString().contains('gpu')) {
            def gpuNum = task.label.toString().contains('high_gpu') ? 2 : 1
            options += " -gpu num=${gpuNum}"
        }

        return options
    }

    // Add GPU awareness to the container runtime
    withLabel: 'gpu|.*gpu.*' {
        containerOptions = '--nv'
    }

    // ERROR HANDLING CONFIGURATION
    // Default dynamic error strategy for most processes
    errorStrategy = {
        if (task.exitStatus in [130, 137, 140] && task.attempt <= 3)
            return 'retry'
        else if (task.exitStatus in (131..145) && task.attempt <= 1)
            return 'retry'
        else
            return 'finish'
    }
    maxRetries = 3

    // Special error handling labels (these override the dynamic strategy above)
    withLabel:error_ignore {
        errorStrategy = 'ignore'
        maxRetries = 0
    }

    withLabel:error_retry {
        errorStrategy = 'retry'
        maxRetries = 2
    }

    // Run all process scripts under strict bash settings
    shell = ['/bin/bash', '-euo', 'pipefail']
}

docs/mssm.md

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
# nf-core/configs: MSSM Configuration

This Nextflow profile has been configured to run nf-core maintained pipelines on the **Minerva HPC** cluster at the **Icahn School of Medicine at Mount Sinai**. All testing has been done with pipelines that follow the DSL2 framework.

Run the pipeline with `-profile mssm`. This will download and launch [`mssm.config`](../conf/mssm.config), which has been pre-configured with a setup suitable for the Minerva HPC cluster. Using this profile, a container image containing all of the required software will be downloaded and converted to a Singularity image before execution of the pipeline. While this is typically governed by the configuration and execution framework, some manual adjustments will be needed at times; see [Custom Configuration](#custom-configuration) below.

## Contact Information

**HPC Support:** hpchelp@hpc.mssm.edu
**Author:** Christopher Tastad - Judy Cho Lab
**Email:** christopher.tastad@mssm.edu

## Required Environment Setup

Before running any nf-core pipeline on Minerva, you **must set the MINERVA_ALLOCATION environment variable** in your submission script:

```bash
export MINERVA_ALLOCATION="acc_YOUR-PROJECT-NAME"
```

## Module Requirements

To run nf-core pipelines on Minerva, you need Nextflow and Singularity. Experience has shown that the local Nextflow module can cause difficulties with execution, so the conda environment installation described by nf-core is preferred.

```bash
ml java
ml anaconda3
ml singularity-ce

# Activate Nextflow from conda environment
source /hpc/packages/minerva-centos7/anaconda3/2018.12/etc/profile.d/conda.sh
conda init bash
conda activate nextflow
```
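
A quick sanity check (a minimal sketch; the reported version numbers will vary) confirms that both tools resolve on your path before you submit anything:

```bash
# Confirm the environment is usable before submitting jobs
nextflow -version        # should report the conda-installed Nextflow
singularity --version    # should report the singularity-ce module
```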

## Proxy Configuration

Minerva requires proxy settings to download containers from remote sources:

```bash
export http_proxy=http://172.28.7.1:3128
export https_proxy=http://172.28.7.1:3128
export all_proxy=http://172.28.7.1:3128
export no_proxy=localhost,*.chimera.hpc.mssm.edu,172.28.0.0/16
```
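
To verify the proxy is working, one quick check is to request headers from a container host; this sketch assumes `curl` is available and uses depot.galaxyproject.org (a common source of nf-core Singularity images) as the test target:

```bash
# curl honors the http(s)_proxy variables exported above;
# getting an HTTP status line back means the proxy is passing traffic
curl -sI https://depot.galaxyproject.org | head -n 1
```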

## Configuration Details

This profile includes:

- **LSF executor** configuration optimized for Minerva
- **Dynamic queue selection** based on job requirements (see the queue check sketch below):
  - `express` queue for short jobs (≤12h, ≤8 CPUs)
  - `premium` queue for standard jobs
  - `long` queue for jobs >144h
  - `gpu` and `gpuexpress` queues for GPU workloads
- **Singularity container** support with proper cache directories
- **Error handling** strategies with automatic retries
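
To confirm which queue the dynamic selection actually assigned to a task, you can query LSF directly. A minimal sketch, assuming a reasonably recent LSF release with `bjobs -o` format support:

```bash
# List your running/pending task jobs together with their assigned queues
bjobs -u $USER -o "jobid job_name queue stat" | head
```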

## Example Submission Script

```bash
#!/bin/bash
#BSUB -J nfcore-pipeline-job
#BSUB -P acc_YOUR-PROJECT-NAME
#BSUB -W 48:00
#BSUB -q premium
#BSUB -n 2
#BSUB -R rusage[mem=8GB]
#BSUB -R span[hosts=1]
#BSUB -o output_%J.stdout
#BSUB -eo error_%J.stderr
#BSUB -L /bin/bash

PROJ_DIR=/path/to/project/
NFC_PIPE=nf-core/rnaseq
NFC_VER=3.18.0
NFC_PROFILE=mssm,singularity
SAMPLESHEET=$PROJ_DIR/samplesheet.csv
OUTDIR=$PROJ_DIR
GENOME=GRCh38

# Required environment variable
export MINERVA_ALLOCATION='acc_YOUR-PROJECT-NAME'

# Proxy settings
export http_proxy=http://172.28.7.1:3128
export https_proxy=http://172.28.7.1:3128
export all_proxy=http://172.28.7.1:3128
export no_proxy=localhost,*.chimera.hpc.mssm.edu,172.28.0.0/16

# Load modules
ml java
ml anaconda3
ml singularity-ce

# Set up Nextflow environment
source /hpc/packages/minerva-centos7/anaconda3/2018.12/etc/profile.d/conda.sh
conda init bash
conda activate nextflow

cd $PROJ_DIR

# Run pipeline
nextflow run $NFC_PIPE \
    -r $NFC_VER \
    -profile $NFC_PROFILE \
    -w /sc/arion/scratch/${USER}/work \
    -c $PROJ_DIR/custom.config \
    --input $SAMPLESHEET \
    --outdir $OUTDIR \
    --genome $GENOME
```
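
The script is submitted to LSF with `bsub`; the file name here is just an example:

```bash
# Submit the script above and watch its progress
bsub < run_nfcore.lsf
bjobs -J nfcore-pipeline-job
```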

## Custom Configuration

Users can supplement the base configuration by creating a `custom.config` file. Many processes require minor modifications to address specific parameters of a dataset or condition. Given the generalized nature of the main config profile and the diversity of process requirements, needing this flexibility is common.

```nextflow
process {
    withName: 'PICARD_MARKDUPLICATES' {
        ext.suffix = 'bam' // Explicitly set the suffix to avoid using getExtension()
    }

    withName: 'STRINGTIE_STRINGTIE' {
        memory = '24.GB' // Increase to at least 2-3x the default
    }
}
```
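
To see how such overrides merge into the final settings before launching, `nextflow config` can print the resolved configuration; a sketch, assuming the pipeline has already been pulled with `nextflow pull nf-core/rnaseq`:

```bash
# Print the fully resolved configuration, including custom.config overrides
nextflow -c custom.config config nf-core/rnaseq -profile mssm | less
```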

## Troubleshooting

### Common Issues

- **Singularity cache errors**: If you encounter errors related to Singularity caching, check your scratch or work space allocation, and clean up the cache directory if needed. A common issue relates to the Singularity `pullTimeout` setting: large remote images may exceed it and benefit from being pulled into the cache manually (see the sketch below).
- **Memory issues**: Some processes may require more memory than the default allocation. Use a custom config to increase memory for specific processes.
- **LSF job submission failures**: Ensure your MINERVA_ALLOCATION variable is set correctly and that you have sufficient allocation hours remaining.
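
A minimal sketch of such a manual pre-pull into the cache directory used by this profile; the image URI is a hypothetical example, so substitute the one reported in the pipeline's pull error:

```bash
# Pull the image into the profile's cache ahead of time so Nextflow finds it there
export SINGULARITY_CACHEDIR="/sc/arion/work/${USER}/singularity_cache"
singularity pull --dir "$SINGULARITY_CACHEDIR" \
    docker://quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0
```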

## Tested Versions

This configuration has been tested with:

- Nextflow: 24.10.3
- Singularity-ce: 4.1.1
- nf-core pipelines: DSL2 compatible (2022-2025)

:::note
You will need an account and allocation on the Minerva HPC cluster to run nf-core pipelines. For accounts and allocation requests, contact your lab administrator or hpchelp@hpc.mssm.edu.
:::

:::note
All jobs will be submitted to the cluster via the LSF scheduler. For technical assistance with the HPC environment, contact hpchelp@hpc.mssm.edu.
:::

nfcore_custom.config

Lines changed: 3 additions & 0 deletions
@@ -277,6 +277,9 @@ profiles {
     mpcdf_viper {
         includeConfig "${params.custom_config_base}/conf/mpcdf_viper.config"
     }
+    mssm {
+        includeConfig "${params.custom_config_base}/conf/mssm.config"
+    }
     munin {
         includeConfig "${params.custom_config_base}/conf/munin.config"
     }
