Skip to content

Commit b057f0a

Browse files
authored
Merge pull request #780 from nf-core/preseq-extension
Add LC_Extrap mode for @robert-davidson
2 parents c0c2997 + ff3cef3 commit b057f0a

8 files changed

Lines changed: 78 additions & 11 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ jobs:
126126
- name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
127127
run: |
128128
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
129+
- name: PRESEQ Run basic mapping pipeline with different preseq mode
130+
run: |
131+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preseq_mode 'lc_extrap' --preseq_maxextrap 10000 --preseq_bootstrap 10
129132
- name: DEDUPLICATION Test with dedup
130133
run: |
131134
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged

.github/workflows/linting.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ jobs:
107107
- name: Install dependencies
108108
run: |
109109
python -m pip install --upgrade pip
110-
pip install nf-core
110+
pip install nf-core==1.14
111111
112112
- name: Run nf-core lint
113113
env:

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
88
### `Added`
99

1010
- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
11+
- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
1112

1213
### `Fixed`
1314

assets/multiqc_config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ top_modules:
8686
- '*_postfilterflagstat.stats'
8787
- 'dedup'
8888
- 'picard'
89-
- 'preseq'
89+
- 'preseq':
90+
path_filters:
91+
- '*.preseq'
9092
- 'damageprofiler'
9193
- 'mtnucratio'
9294
- 'qualimap'

docs/output.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
653653
* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
654654
* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
655655
* `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you.
656-
* `preseq/`: this contains a `.ccurve` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
656+
* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
657657
* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads.
658658
* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
659659
* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.

main.nf

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,15 @@ if ( params.skip_collapse && params.skip_trim ) {
4646
}
4747

4848
// Bedtools validation
49-
if(params.run_bedtools_coverage && !params.anno_file ){
49+
if( params.run_bedtools_coverage && !params.anno_file ){
5050
exit 1, "[nf-core/eager] error: you have turned on bedtools coverage, but not specified a BED or GFF file with --anno_file. Please validate your parameters."
5151
}
5252

53+
// Bedtools validation
54+
if( !params.skip_preseq && !( params.preseq_mode == 'c_curve' || params.preseq_mode == 'lc_extrap' ) ) {
55+
exit 1, "[nf-core/eager] error: you are running preseq with a unsupported mode. See documentation for more information. You gave: ${params.preseq_mode}."
56+
}
57+
5358
// BAM filtering validation
5459
if (!params.run_bam_filtering && params.bam_mapping_quality_threshold != 0) {
5560
exit 1, "[nf-core/eager] error: please turn on BAM filtering if you want to perform mapping quality filtering! Provide: --run_bam_filtering."
@@ -1915,21 +1920,33 @@ process preseq {
19151920
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(input) from ch_input_for_preseq
19161921

19171922
output:
1918-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.ccurve") into ch_preseq_for_multiqc
1923+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.preseq") into ch_preseq_for_multiqc
19191924

19201925
script:
19211926
pe_mode = params.skip_collapse && seqtype == "PE" ? '-P' : ''
1922-
if(!params.skip_deduplication && params.dedupper == "dedup"){
1927+
if(!params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "dedup"){
1928+
"""
1929+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input}
1930+
"""
1931+
} else if( !params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "markduplicates"){
1932+
"""
1933+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
1934+
"""
1935+
} else if ( params.skip_deduplication && params.preseq_mode == 'c_curve' ) {
1936+
"""
1937+
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
1938+
"""
1939+
} else if(!params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "dedup"){
19231940
"""
1924-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -H ${input}
1941+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
19251942
"""
1926-
} else if( !params.skip_deduplication && params.dedupper == "markduplicates"){
1943+
} else if( !params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "markduplicates"){
19271944
"""
1928-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
1945+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
19291946
"""
1930-
} else if ( params.skip_deduplication ) {
1947+
} else if ( params.skip_deduplication && params.preseq_mode == 'lc_extrap' ) {
19311948
"""
1932-
preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
1949+
preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
19331950
"""
19341951
}
19351952
}

nextflow.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ params {
109109

110110
//Preseq settings
111111
preseq_step_size = 1000
112+
preseq_mode = 'c_curve'
113+
preseq_bootstrap = 100
114+
preseq_maxextrap = 10000000000
115+
preseq_cval = 0.95
116+
preseq_terms = 100
112117

113118
//DamageProfiler settings
114119
damageprofiler_length = 100

nextflow_schema.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,12 +767,51 @@
767767
"description": "Options for calculating library complexity (i.e. how many unique reads are present).",
768768
"default": "",
769769
"properties": {
770+
"preseq_mode": {
771+
"type": "string",
772+
"default": "c_curve",
773+
"description": "Specify which mode of preseq to run.",
774+
"fa_icon": "fas fa-toggle-on",
775+
"help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram",
776+
"enum": [
777+
"c_curve",
778+
"lc_extrap"
779+
]
780+
},
770781
"preseq_step_size": {
771782
"type": "integer",
772783
"default": 1000,
773784
"description": "Specify the step size of Preseq.",
774785
"fa_icon": "fas fa-shoe-prints",
775786
"help_text": "Can be used to configure the step size of Preseq's `c_curve` method. Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve parameter: `-s`"
787+
},
788+
"preseq_maxextrap": {
789+
"type": "integer",
790+
"default": 10000000000,
791+
"description": "Specify the maximum extrapolation (lc_extrap mode only)",
792+
"fa_icon": "fas fa-ban",
793+
"help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`"
794+
},
795+
"preseq_terms": {
796+
"type": "integer",
797+
"default": 100,
798+
"description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)",
799+
"fa_icon": "fas fa-sort-numeric-up-alt",
800+
"help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`"
801+
},
802+
"preseq_bootstrap": {
803+
"type": "integer",
804+
"default": 100,
805+
"description": "Specify number of bootstraps to perform (lc_extrap mode only)",
806+
"fa_icon": "fab fa-bootstrap",
807+
"help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`"
808+
},
809+
"preseq_cval": {
810+
"type": "number",
811+
"default": 0.95,
812+
"description": "Specify confidence interval level (lc_extrap mode only)",
813+
"fa_icon": "fas fa-check-circle",
814+
"help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`"
776815
}
777816
},
778817
"fa_icon": "fas fa-bezier-curve",

0 commit comments

Comments
 (0)