Skip to content

Commit e02bc14

Browse files
authored
Merge branch 'major-release-wangen' into inline-barcode-trimming
2 parents 7f01bef + b057f0a commit e02bc14

8 files changed

Lines changed: 118 additions & 20 deletions

File tree

.github/workflows/ci.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ jobs:
102102
- name: ADAPTERREMOVAL Run the basic pipeline with preserve5p end and merged reads only options
103103
run: |
104104
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preserve5p --mergedonly
105+
- name: ADAPTER LIST Run the basic pipeline using an adapter list
106+
run: |
107+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
108+
- name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
109+
run: |
110+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval
105111
- name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
106112
run: |
107113
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming
@@ -126,6 +132,9 @@ jobs:
126132
- name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
127133
run: |
128134
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
135+
- name: PRESEQ Run basic mapping pipeline with different preseq mode
136+
run: |
137+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preseq_mode 'lc_extrap' --preseq_maxextrap 10000 --preseq_bootstrap 10
129138
- name: DEDUPLICATION Test with dedup
130139
run: |
131140
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged

.github/workflows/linting.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ jobs:
107107
- name: Install dependencies
108108
run: |
109109
python -m pip install --upgrade pip
110-
pip install nf-core
110+
pip install nf-core==1.14
111111
112112
- name: Run nf-core lint
113113
env:

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
### `Added`
99

10+
- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
11+
- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
1012
- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) adds post-adapter removal barcode/fastq trimming
1113

1214
### `Fixed`
1315

1416
- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
1517
- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
18+
- Improved output documentation for BowTie2 (thanks to @isinaltinkaya)
1619

1720
### `Dependencies`
1821

assets/multiqc_config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ top_modules:
8686
- '*_postfilterflagstat.stats'
8787
- 'dedup'
8888
- 'picard'
89-
- 'preseq'
89+
- 'preseq':
90+
path_filters:
91+
- '*.preseq'
9092
- 'damageprofiler'
9193
- 'mtnucratio'
9294
- 'qualimap'

docs/output.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ Ancient DNA samples typically have low endogenous DNA values, as most of the DNA
334334
<img src="images/output/bowtie2/bowtie2_alignment_scores.png" width="75%" height = "75%">
335335
</p>
336336

337-
The main additional useful information compared to [Samtools](#samtools) is that these plots can inform you how many reads had multiple places on the reference the read could align to. This can occur with low complexity reads or reads derived from e.g. repetitive regions on the genome. If you have large amounts of multi-mapping reads, this can be a warning flag that there is an issue either with the reference genome or library itself (e.g. over-amplification of low-complexity regions or library construction artefacts). You should investigate cases like this more closely before using the data downstream.
337+
The main additional useful information compared to [Samtools](#samtools) is that these plots can inform you how many reads had multiple places on the reference the read could align to. This can occur with low complexity reads or reads derived from e.g. repetitive regions on the genome. If you have large amounts of multi-mapping reads, this can be a warning flag that there is an issue either with the reference genome or library itself (e.g. library construction artefacts). You should investigate cases like this more closely before using the data downstream.
338338

339339
### MALT
340340

@@ -655,7 +655,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
655655
* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
656656
* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats files. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
657657
* `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and are otherwise very likely not useful for you.
658-
* `preseq/`: this contains a `.ccurve` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
658+
* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the number of unique reads that will be yielded if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
659659
* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads.
660660
* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
661661
* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.

main.nf

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,15 @@ if ( params.skip_collapse && params.skip_trim ) {
4646
}
4747

4848
// Bedtools validation
49-
if(params.run_bedtools_coverage && !params.anno_file ){
49+
if( params.run_bedtools_coverage && !params.anno_file ){
5050
exit 1, "[nf-core/eager] error: you have turned on bedtools coverage, but not specified a BED or GFF file with --anno_file. Please validate your parameters."
5151
}
5252

53+
// Preseq validation
54+
if( !params.skip_preseq && !( params.preseq_mode == 'c_curve' || params.preseq_mode == 'lc_extrap' ) ) {
55+
exit 1, "[nf-core/eager] error: you are running preseq with a unsupported mode. See documentation for more information. You gave: ${params.preseq_mode}."
56+
}
57+
5358
// BAM filtering validation
5459
if (!params.run_bam_filtering && params.bam_mapping_quality_threshold != 0) {
5560
exit 1, "[nf-core/eager] error: please turn on BAM filtering if you want to perform mapping quality filtering! Provide: --run_bam_filtering."
@@ -227,6 +232,20 @@ if( params.bt2_index && params.mapper == 'bowtie2' ){
227232
bwa_index_bwamem = Channel.empty()
228233
}
229234

235+
// Adapter removal adapter-list setup
236+
if ( !params.clip_adapters_list ) {
237+
Channel
238+
.fromPath("$projectDir/assets/nf-core_eager_dummy2.txt", checkIfExists: true)
239+
.ifEmpty { exit 1, "[nf-core/eager] error: adapters list file not found. Please check input. Supplied: --clip_adapters_list '${params.clip_adapters_list}'." }
240+
.into {ch_adapterlist}
241+
} else {
242+
Channel
243+
.fromPath("${params.clip_adapters_list}", checkIfExists: true)
244+
.ifEmpty { exit 1, "[nf-core/eager] error: adapters list file not found. Please check input. Supplied: --clip_adapters_list '${params.clip_adapters_list}'." }
245+
.into {ch_adapterlist}
246+
}
247+
248+
230249
// SexDetermination channel set up and bedfile validation
231250
if (!params.sexdeterrmine_bedfile) {
232251
ch_bed_for_sexdeterrmine = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt")
@@ -765,25 +784,27 @@ process adapter_removal {
765784

766785
input:
767786
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_fastp_for_adapterremoval
787+
path adapterlist from ch_adapterlist.collect().dump(tag: "Adapter list")
768788

769789
output:
770790
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1
771791
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*pair2.truncated.gz") optional true into ch_output_from_adapterremoval_r2
772792
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*.settings") into ch_adapterremoval_logs
773-
793+
774794
when:
775795
!params.skip_adapterremoval
776796

777797
script:
778-
base = "${r1.baseName}_L${lane}"
798+
def base = "${r1.baseName}_L${lane}"
799+
def adapters_to_remove = !params.clip_adapters_list ? "--adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor}" : "--adapter-list ${adapterlist}"
779800
// This checks whether the 5p end should be preserved and defines the corresponding flag variable
780801
def preserve5p = params.preserve5p ? '--preserve5p' : '' // applies to any AR command - doesn't affect output file combination
781802

782803
if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && !params.mergedonly && !params.preserve5p ) {
783804
"""
784805
mkdir -p output
785806
786-
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
807+
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
787808
788809
cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
789810
@@ -797,7 +818,7 @@ process adapter_removal {
797818
"""
798819
mkdir -p output
799820
800-
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
821+
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
801822
802823
cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
803824
@@ -810,7 +831,7 @@ process adapter_removal {
810831
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && !params.preserve5p ) {
811832
"""
812833
mkdir -p output
813-
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
834+
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
814835
815836
cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
816837
@@ -823,7 +844,7 @@ process adapter_removal {
823844
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && params.preserve5p ) {
824845
"""
825846
mkdir -p output
826-
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
847+
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
827848
828849
cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz
829850
@@ -864,15 +885,15 @@ process adapter_removal {
864885
} else if ( seqtype == 'PE' && params.skip_collapse && !params.skip_trim ) {
865886
"""
866887
mkdir -p output
867-
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
888+
AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
868889
869890
mv ${base}.pe.pair*.truncated.gz *.settings output/
870891
"""
871892
} else if ( seqtype != 'PE' && !params.skip_trim ) {
872893
//SE, collapse not possible, trim reads only
873894
"""
874895
mkdir -p output
875-
AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
896+
AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
876897
mv *.settings *.se.truncated.gz output/
877898
"""
878899
} else if ( seqtype != 'PE' && params.skip_trim ) {
@@ -1982,21 +2003,33 @@ process preseq {
19822003
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(input) from ch_input_for_preseq
19832004

19842005
output:
1985-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.ccurve") into ch_preseq_for_multiqc
2006+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.preseq") into ch_preseq_for_multiqc
19862007

19872008
script:
19882009
pe_mode = params.skip_collapse && seqtype == "PE" ? '-P' : ''
1989-
if(!params.skip_deduplication && params.dedupper == "dedup"){
2010+
if(!params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "dedup"){
2011+
"""
2012+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input}
2013+
"""
2014+
} else if( !params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "markduplicates"){
2015+
"""
2016+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
2017+
"""
2018+
} else if ( params.skip_deduplication && params.preseq_mode == 'c_curve' ) {
2019+
"""
2020+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
2021+
"""
2022+
} else if(!params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "dedup"){
19902023
"""
1991-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -H ${input}
2024+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
19922025
"""
1993-
} else if( !params.skip_deduplication && params.dedupper == "markduplicates"){
2026+
} else if( !params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "markduplicates"){
19942027
"""
1995-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
2028+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
19962029
"""
1997-
} else if ( params.skip_deduplication ) {
2030+
} else if ( params.skip_deduplication && params.preseq_mode == 'lc_extrap' ) {
19982031
"""
1999-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
2032+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
20002033
"""
20012034
}
20022035
}

nextflow.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ params {
6666
//Read clipping and merging parameters
6767
clip_forward_adaptor = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
6868
clip_reverse_adaptor = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'
69+
clip_adapters_list = null
6970
clip_readlength = 30
7071
clip_min_read_quality = 20
7172
min_adap_overlap = 1
@@ -113,6 +114,11 @@ params {
113114

114115
//Preseq settings
115116
preseq_step_size = 1000
117+
preseq_mode = 'c_curve'
118+
preseq_bootstrap = 100
119+
preseq_maxextrap = 10000000000
120+
preseq_cval = 0.95
121+
preseq_terms = 100
116122

117123
//DamageProfiler settings
118124
damageprofiler_length = 100

0 commit comments

Comments
 (0)