
Commit 3a9a247

Merge pull request #1102 from RaqManzano/update-cambridge-config
Update Cambridge config for current CSD3 partitions.
2 parents 47dcc11 + 9325477

3 files changed: 196 additions & 66 deletions


.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 **/roslin** @sguizard @donalddunbar
 **/lrz_cm4** @nschan
 **/crg** @joseespinosa
+**/cambridge** @RaqManzano
 **/iris** @nikhil
 **/mahuika** @jen-reeve
 **/purdue_** @aseetharam

conf/cambridge.config

Lines changed: 71 additions & 17 deletions
@@ -1,28 +1,82 @@
-// Description is overwritten with user specific flags
+// nf-core/configs: Cambridge CSD3 cluster profile
+// Available partitions:
+//   - icelake       : 76 CPUs, 256 GB RAM
+//   - icelake-himem : 76 CPUs, 512 GB RAM
+//   - sapphire      : 112 CPUs, 512 GB RAM
+//
+// The profile defaults to the broadly available `icelake` partition but allows
+// users to override it with `--partition`. Walltime is inferred from the SLURM
+// account name when possible: projects containing `-SL3-` use a 12 h cap,
+// otherwise the profile assumes the common SL1/SL2 36 h limit.
+
 params {
-    config_profile_description = 'Cambridge HPC cluster profile.'
-    // FIXME EmelineFavreau was the last to edit this
-    config_profile_contact     = 'Andries van Tonder (ajv37@cam.ac.uk)'
-    config_profile_url         = "https://docs.hpc.cam.ac.uk/hpc"
-    partition                  = null
-    project                    = null
-    max_memory                 = 192.GB
-    max_cpus                   = 56
-    max_time                   = 12.h
+    config_profile_description = 'Cambridge HPC CSD3 cluster profile.'
+    config_profile_contact     = 'Raquel Manzano (rm889@cam.ac.uk) and Andries van Tonder (ajv37@cam.ac.uk)'
+    config_profile_url         = 'https://docs.hpc.cam.ac.uk/hpc'
+
+    partition = 'icelake'
+    project   = null
+
+    // Compatibility with nf-core schema validation across pipeline versions.
+    schema_ignore_params         = 'partition,project,max_memory,max_cpus,max_time,csd_time,csd_parts,csd_selected,validationSchemaIgnoreParams'
+    validationSchemaIgnoreParams = 'partition,project,max_memory,max_cpus,max_time,csd_time,csd_parts,csd_selected,schema_ignore_params,validationSchemaIgnoreParams'
+}
+
+validation {
+    ignoreParams = ['partition', 'project', 'max_memory', 'max_cpus', 'max_time', 'csd_time', 'csd_parts', 'csd_selected', 'schema_ignore_params', 'validationSchemaIgnoreParams']
 }
 
-// Description is overwritten with user specific flags
+params.csd_time = {
+    params.project?.toUpperCase()?.contains('-SL3-') ? 12.h : 36.h
+}.call()
+
+params.csd_parts = [
+    icelake        : [memory: 256.GB, cpus: 76, time: params.csd_time],
+    'icelake-himem': [memory: 512.GB, cpus: 76, time: params.csd_time],
+    sapphire       : [memory: 512.GB, cpus: 112, time: params.csd_time]
+]
+
+params.csd_selected = {
+    def selected = params.csd_parts[params.partition]
+
+    if (!selected) {
+        System.err.println("ERROR: cambridge params.partition must be one of 'icelake', 'icelake-himem', or 'sapphire' (got '${params.partition}').")
+        System.exit(1)
+    }
+
+    selected
+}.call()
+
+params.max_memory = params.csd_selected.memory
+params.max_cpus   = params.csd_selected.cpus
+params.max_time   = params.csd_selected.time
+
 singularity {
     enabled    = true
     autoMounts = true
 }
 
 process {
-    resourceLimits = [
-        memory: 192.GB,
-        cpus: 56,
-        time: 12.h
-    ]
+    resourceLimits = params.csd_selected
+
+    beforeScript = """
+    . /etc/profile.d/modules.sh
+    module purge
+    module load rhel8/default-${params.partition == 'sapphire' ? 'sar' : 'icl'}
+    """
+
     executor       = 'slurm'
-    clusterOptions = "-A ${params.project} -p ${params.partition}"
+    queue          = params.partition
+    clusterOptions = { params.project ? "--account=${params.project}" : '' }
+    cache          = 'lenient'
+    scratch        = true
+}
+
+executor {
+    name              = 'slurm'
+    queueSize         = 200
+    pollInterval      = '5 min'
+    queueStatInterval = '5 min'
+    submitRateLimit   = '10 sec'
+    exitReadTimeout   = '30 min'
 }
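In practice, the `--partition` flag picks the per-task resource caps and the `--project` account name picks the walltime. As a sketch (the project name `MYPROJ-SL3-CPU` below is a placeholder, not from this commit), a launch against `sapphire` would behave like this:

```bash
# --partition sapphire      -> resourceLimits become 112 CPUs / 512 GB
# --project MYPROJ-SL3-CPU  -> name contains '-SL3-', so max_time = 12.h
nextflow run nf-core/sarek -profile test,cambridge \
  --partition sapphire \
  --project MYPROJ-SL3-CPU \
  --outdir nf-sarek-test
```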

docs/cambridge.md

Lines changed: 124 additions & 49 deletions
@@ -1,87 +1,162 @@
 # nf-core/configs: Cambridge HPC Configuration
 
-All nf-core pipelines have been successfully configured for use on the Cambridge HPC cluster at the [The University of Cambridge](https://www.cam.ac.uk/).
-To use, run the pipeline with `-profile cambridge`. This will download and launch the [`cambridge.config`](../conf/cambridge.config) which has been pre-configured
-with a setup suitable for the Cambridge HPC cluster. Using this profile, either a docker image containing all of the required software will be downloaded,
-and converted to a Singularity image or a Singularity image downloaded directly before execution of the pipeline.
+All nf-core pipelines can be run on the [Cambridge HPC cluster](https://docs.hpc.cam.ac.uk/hpc/index.html) at the University of Cambridge using `-profile cambridge`.
+This will download and use the [`cambridge.config`](../conf/cambridge.config)
+institutional profile, which is configured for running pipelines on CSD3 with
+Singularity containers.
 
 ### Install Nextflow
 
-The latest version of Nextflow is not installed by default on the Cambridge HPC cluster CSD3. You can install it with conda:
+The latest version of Nextflow is not installed by default on CSD3.
+
+The recommended option is the standard Nextflow self-installing package:
 
 ```
-module load miniconda/3
+# Check that Java 17+ is available
+java -version
+
+# Download Nextflow
+curl -s https://get.nextflow.io | bash
 
-# set up Bioconda according to the Bioconda documentation, notably setting up channels
-conda config --add channels defaults
-conda config --add channels bioconda
-conda config --add channels conda-forge
+# Make it executable
+chmod +x nextflow
 
-# create the environment env_nf, and install the tool nextflow
-conda create --name env_nf nextflow
+# Move it to a personal bin directory in hpc-work
+mkdir -p $HOME/rds/hpc-work/bin
+mv nextflow $HOME/rds/hpc-work/bin/
 
-# activate the environment containing nextflow
-conda activate env_nf
+# Add that directory to your PATH if needed
+export PATH="$HOME/rds/hpc-work/bin:$PATH"
 
-# once done with the environment, deactivate
-conda deactivate
+# Confirm the installation
+nextflow info
 ```
 
-Alternatively, you can install Nextflow into a directory you have write access to.
-Follow [these instructions](https://www.nextflow.io/docs/latest/getstarted.html#) from the Nextflow documentation. This alternative method requires also to update java.
+To make the `PATH` change persistent across sessions, add the `export PATH=...`
+line to your `~/.bashrc` or equivalent shell startup file.
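For example, appending it once (a sketch assuming the `hpc-work` bin directory from the steps above):

```bash
# Persist the PATH update for future logins
echo 'export PATH="$HOME/rds/hpc-work/bin:$PATH"' >> ~/.bashrc
```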
 
-```
-# move to desired directory on HPC
-cd /home/<username>/path/to/dir
+See the official installation guide for the latest details and Java
+requirements:
 
-# get the newest version
-wget -qO- https://get.nextflow.io | bash
+- [nf-core / Nextflow installation guide](https://nf-co.re/docs/usage/getting_started/installation)
 
-# update java version to the latest
-wget https://download.oracle.com/java/20/latest/jdk-20_linux-x64_bin.tar.gz
-tar xvfz jdk-20_linux-x64_bin.tar.gz
+If you prefer a user-managed package manager, a simple option is to install
+`micromamba` and then follow the nf-core conda-style instructions for creating
+an environment with `nextflow` (see the sketch after the links below):
 
-# if all tools are compatible with the java version you chose, add these lines to .bashrc
-export JAVA_HOME=/home/<username>/path/to/dir/jdk-20.0.1
-export PATH=/home/<username>/path/to/dir/jdk-20.0.1/bin:$PATH
+- [micromamba installation guide](https://mamba.readthedocs.io/en/stable/installation/micromamba-installation.html)
+- [nf-core conda installation instructions](https://nf-co.re/docs/usage/getting_started/installation#conda-installation)
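A minimal sketch of the micromamba route, assuming the installer script and channel setup described in the linked guides:

```bash
# Install micromamba into your home directory (official installer script)
"${SHELL}" <(curl -L micro.mamba.pm/install.sh)

# Create an environment named env_nf with Nextflow from conda-forge/bioconda
micromamba create -n env_nf -c conda-forge -c bioconda nextflow

# Activate it and confirm
micromamba activate env_nf
nextflow -version
```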
 
-# Once above is done `java --version` should return `java 20.0.1 2023-04-18`
-java --version
+`pixi` may also work well for personal environment management; see the
+[pixi documentation](https://pixi.prefix.dev/latest/). However, this profile
+does not currently provide tested `pixi` instructions, so `micromamba` is the
+more conservative recommendation here.
+
+### Set up Singularity cache
 
+Nextflow runs tools in Singularity containers and caches the images it pulls.
+You might therefore want to set the `NXF_SINGULARITY_CACHEDIR` bash environment
+variable to point at a directory with sufficient space; if unset, the cache
+defaults to the current directory.
+
+```
+# do this once per login, or add this line to .bashrc
+export NXF_SINGULARITY_CACHEDIR=$HOME/rds/hpc-work/nxf-singularity-cache
 ```
 
-### Set up Singularity cache
+On CSD3, Singularity is available by default, so no additional module loading
+should be required.
 
-Singularity allows the use of containers and will use a caching strategy. First, you might want to set the `NXF_SINGULARITY_CACHEDIR` bash environment variable, pointing at your hpc-work location. If not, it will be automatically assigned to the current directory.
+### Run Nextflow
+
+Here is an example with the nf-core pipeline sarek ([read the documentation here](https://nf-co.re/sarek/3.3.2)).
+The profile defaults to the `icelake` partition, but users can switch to
+`icelake-himem` or `sapphire` with `--partition`. The user should also provide
+their SLURM project / account with `--project`.
+
+#### Choosing a partition
+
+As a rough guide, `icelake` is the default general-purpose choice for most
+workflows. `icelake-himem` is the better option when processes need more memory
+per CPU, for example memory-hungry tasks or jobs using only a small number of
+CPUs but requiring substantial RAM. `sapphire` provides newer Sapphire Rapids
+nodes with 112 CPUs and about 4.5 GiB RAM per CPU (512 GB per node), so it may
+be a better fit for higher-CPU jobs than `icelake`.
+
+#### Example
 
 ```
-# do this once per login, or add these lines to .bashrc
-export NXF_SINGULARITY_CACHEDIR=/home/<username>/rds/hpc-work/path/to/cache/dir
+# Launch the nf-core pipeline for a test database
+# with the Cambridge profile
+nextflow run nf-core/sarek -profile test,cambridge --partition icelake --project NAME-SL2-CPU --outdir nf-sarek-test
 ```
 
-Once done, and ready to use Nextflow, you can check that the Singularity module is loaded by default when logging on the cluster.
+If the project name contains `-SL3-`, the profile applies a 12 h walltime cap.
+Otherwise it assumes the standard SL1 / SL2 36 h limit.
+
+#### Running Nextflow on CSD3
+
+We recommend starting Nextflow inside a `screen` or `tmux` session so that the
+Nextflow manager process keeps running after you disconnect your SSH session.
 
 ```
-module list
+# Start a tmux session
+tmux new -s nextflow
+
+# Or start a screen session
+screen -S nextflow
 
-# If singularity is not loaded:
-module load singularity
+# Re-attach later if needed
+tmux attach -t nextflow
+screen -r nextflow
 ```
 
-### Run Nextflow
+To detach and leave the workflow running in the background, press `Ctrl-b`
+then `d` in `tmux`; for `screen`, use `Ctrl-a` then `d`.
 
-Here is an example with the nf-core pipeline sarek ([read documentation here](https://nf-co.re/sarek/3.3.2)).
-The user includes the project name and the node.
+You can then log out of the HPC and reattach to the session later.
+Before logging out, make sure to **note the node you're on**.
+Suppose your login node was called `login-p-3`; you can later log back into this specific node as follows:
 
+```bash
+ssh username@login-p-3.hpc.cam.ac.uk
 ```
-# Launch the nf-core pipeline for a test database
-# with the Cambridge profile
-nextflow run nf-core/sarek -profile test,cambridge.config --partition "cclake" --project "NAME-SL3-CPU" --outdir nf-sarek-test
+
+Then you can re-attach to the `tmux`/`screen` session:
+
+```bash
+tmux attach -t nextflow
+screen -r nextflow
 ```
 
-All of the intermediate files required to run the pipeline will be stored in the `work/` directory. It is recommended to delete this directory after the pipeline
-has finished successfully because it can get quite large, and all of the main output files will be saved in the `results/` directory anyway.
+#### Limit Nextflow JVM memory (recommended)
+
+If needed, you can limit the memory used by the Nextflow manager process by
+setting:
+
+```bash
+export NXF_JVM_ARGS='-Xms2g -Xmx4g'
+```
+
+This is a conservative example that should work for most runs. If the Nextflow
+manager process still runs into memory errors, increase `-Xmx` accordingly.
+This must be set **before** launching `nextflow run ...`. If you want to use
+this setting by default, you can add the export line to your `~/.bashrc`.
+
+#### Large runs
+
+For large runs, for example workflows with many samples or many tasks, the
+Nextflow manager process can itself use substantial memory. In those cases, it
+is better to launch `nextflow run ...` inside an interactive `srun` session or
+submit it via `sbatch`, rather than running it directly on a login node.
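A minimal `sbatch` wrapper might look like the sketch below; the account, partition, memory, and pipeline command are placeholders to adapt to your own project:

```bash
#!/bin/bash
# submit_nextflow.sh - run the Nextflow manager itself as a SLURM job
#SBATCH --job-name=nf-manager
#SBATCH --account=NAME-SL2-CPU   # your SLURM project
#SBATCH --partition=icelake
#SBATCH --cpus-per-task=2
#SBATCH --mem=8G
#SBATCH --time=36:00:00          # must cover the whole workflow

export NXF_SINGULARITY_CACHEDIR=$HOME/rds/hpc-work/nxf-singularity-cache
export NXF_JVM_ARGS='-Xms2g -Xmx4g'

nextflow run nf-core/sarek -profile cambridge \
  --partition icelake --project NAME-SL2-CPU --outdir nf-sarek-run
```

Submit it with `sbatch submit_nextflow.sh`.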
+
+#### `work` directory
+
+All of the intermediate files required to run the pipeline will be stored in
+the `work/` directory. It is recommended to **delete** this directory after the
+pipeline has finished successfully because it can get quite large, and all of
+the main output files will be saved in the `--outdir` directory anyway.
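Besides a plain `rm -rf work/`, Nextflow's built-in cleaner can remove the intermediates of previous runs:

```bash
# Delete work files of the last run (outputs in --outdir are untouched)
nextflow clean -f

# Or clean everything before a named run (see `nextflow log` for run names)
nextflow clean -f -before <run_name>
```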
 
 > NB: You will need an account to use the Cambridge HPC cluster in order to run the pipeline. If in doubt contact IT.
-> NB: Nextflow will need to submit the jobs via SLURM to the Cambridge HPC cluster and as such the commands above will have to be executed on one of the login
-> nodes. If in doubt contact IT.
+> NB: Nextflow will need to submit the jobs via SLURM to the Cambridge HPC cluster and as such the commands above will have to be executed on one of the login nodes. If in doubt contact IT.
