diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b753a453d..459b490e11 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,6 +32,7 @@ jobs: NXF_EDGE: "1" test: - "aligner" + - "alignment_to_fastq" - "annotation" - "cnvkit" - "controlfreec" diff --git a/CHANGELOG.md b/CHANGELOG.md index c9df2d2c11..5ea23e3fd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -103,6 +103,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for correct data type for `params.step` - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for no empty `--tools` with `--step variant_calling` or `--step annotation` - [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings +- [#602](https://github.com/nf-core/sarek/pull/602/) - Fixed bug in `alignment_to_fastq` and added tests ### Deprecated diff --git a/conf/test.config b/conf/test.config index 3cc63737f9..e61e2e9945 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,7 +26,7 @@ params { max_time = '8.h' // Input data - input = "${baseDir}/tests/csv/3.0/fastq_single.csv" + input = "${projectDir}/tests/csv/3.0/fastq_single.csv" // Small reference genome genome = null @@ -48,37 +48,37 @@ params { profiles { annotation { - params.input = "${baseDir}/tests/csv/3.0/vcf_single.csv" + params.input = "${projectDir}/tests/csv/3.0/vcf_single.csv" params.step = 'annotate' } no_intervals { params.no_intervals = true } pair { - params.input = "${baseDir}/tests/csv/3.0/fastq_pair.csv" + params.input = "${projectDir}/tests/csv/3.0/fastq_pair.csv" } markduplicates_bam { - params.input = "${baseDir}/tests/csv/3.0/mapped_single_bam.csv" + params.input = "${projectDir}/tests/csv/3.0/mapped_single_bam.csv" params.step = 'markduplicates' } markduplicates_cram { - params.input = "${baseDir}/tests/csv/3.0/mapped_single_cram.csv" + params.input = 
"${projectDir}/tests/csv/3.0/mapped_single_cram.csv" params.step = 'markduplicates' } prepare_recalibration_bam { - params.input = "${baseDir}/tests/csv/3.0/mapped_single_bam.csv" + params.input = "${projectDir}/tests/csv/3.0/mapped_single_bam.csv" params.step = 'prepare_recalibration' } prepare_recalibration_cram { - params.input = "${baseDir}/tests/csv/3.0/mapped_single_cram.csv" + params.input = "${projectDir}/tests/csv/3.0/mapped_single_cram.csv" params.step = 'prepare_recalibration' } recalibrate_bam { - params.input = "${baseDir}/tests/csv/3.0/prepare_recalibration_single_bam.csv" + params.input = "${projectDir}/tests/csv/3.0/prepare_recalibration_single_bam.csv" params.step = 'recalibrate' } recalibrate_cram { - params.input = "${baseDir}/tests/csv/3.0/prepare_recalibration_single_cram.csv" + params.input = "${projectDir}/tests/csv/3.0/prepare_recalibration_single_cram.csv" params.step = 'recalibrate' } save_bam_mapped { @@ -100,7 +100,7 @@ profiles { params.nucleotides_per_second = 20 } tools { - params.input = "${baseDir}/tests/csv/3.0/recalibrated.csv" + params.input = "${projectDir}/tests/csv/3.0/recalibrated.csv" params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] params.germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] @@ -113,7 +113,7 @@ profiles { params.nucleotides_per_second = 20 } tools_germline { - params.input = "${baseDir}/tests/csv/3.0/recalibrated_germline.csv" + params.input = "${projectDir}/tests/csv/3.0/recalibrated_germline.csv" params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] params.known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz'] params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] @@ -124,7 +124,7 @@ profiles { params.nucleotides_per_second = 20 } tools_tumoronly { - params.input = 
"${baseDir}/tests/csv/3.0/recalibrated_tumoronly.csv" + params.input = "${projectDir}/tests/csv/3.0/recalibrated_tumoronly.csv" params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] params.germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] @@ -137,7 +137,7 @@ profiles { params.nucleotides_per_second = 20 } tools_somatic { - params.input = "${baseDir}/tests/csv/3.0/recalibrated_somatic.csv" + params.input = "${projectDir}/tests/csv/3.0/recalibrated_somatic.csv" params.chr_dir = params.test_data['homo_sapiens']['genome']['genome_21_chromosomes_dir'] params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] @@ -158,14 +158,14 @@ profiles { params.trim_fastq = true } umi { - params.input = "${baseDir}/tests/csv/3.0/fastq_umi.csv" + params.input = "${projectDir}/tests/csv/3.0/fastq_umi.csv" params.umi_read_structure = '7M1S+T' } use_gatk_spark { params.use_gatk_spark = 'baserecalibrator,markduplicates' } variantcalling_channels { - params.input = "${baseDir}/tests/csv/3.0/recalibrated.csv" + params.input = "${projectDir}/tests/csv/3.0/recalibrated.csv" params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] params.wes = true @@ -173,4 +173,7 @@ profiles { params.nucleotides_per_second = 20 } + alignment_to_fastq { + params.input = "${projectDir}/tests/csv/3.0/bam_for_remapping.csv" + } } diff --git a/conf/test_full.config b/conf/test_full.config index a32df0385b..cda0459c61 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 
"${baseDir}/tests/csv/3.0/test_full_data.csv" + input = "${projectDir}/tests/csv/3.0/test_full_data.csv" wes = true diff --git a/subworkflows/nf-core/alignment_to_fastq.nf b/subworkflows/nf-core/alignment_to_fastq.nf index 81720133d4..ec855d6ca2 100644 --- a/subworkflows/nf-core/alignment_to_fastq.nf +++ b/subworkflows/nf-core/alignment_to_fastq.nf @@ -50,17 +50,13 @@ workflow ALIGNMENT_TO_FASTQ { // join Mapped & unmapped fastq unmapped_reads = COLLATE_FASTQ_UNMAP.out.reads - .map{ meta, reads -> - fq_1 = reads.find{ it.toString().endsWith("_1.fq.gz") } - fq_2 = reads.find{ it.toString().endsWith("_2.fq.gz") } - [meta, [ fq_1, fq_2]] + .map{ meta, reads_R1_R2, reads_other, reads_singleton -> + [meta, reads_R1_R2] } mapped_reads = COLLATE_FASTQ_MAP.out.reads - .map{ meta, reads -> - fq_1 = reads.find{ it.toString().endsWith("_1.fq.gz") } - fq_2 = reads.find{ it.toString().endsWith("_2.fq.gz") } - [meta, [ fq_1, fq_2]] + .map{ meta, reads_R1_R2, reads_other, reads_singleton -> + [meta, reads_R1_R2] } reads_to_concat = mapped_reads.join(unmapped_reads) diff --git a/tests/csv/3.0/bam_for_remapping.csv b/tests/csv/3.0/bam_for_remapping.csv new file mode 100644 index 0000000000..9bc47d674f --- /dev/null +++ b/tests/csv/3.0/bam_for_remapping.csv @@ -0,0 +1,2 @@ +patient,gender,status,sample,lane,bam,bai +test,XX,0,test,1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai diff --git a/tests/test_bam_remap.yml b/tests/test_bam_remap.yml new file mode 100644 index 0000000000..840c67b6e0 --- /dev/null +++ b/tests/test_bam_remap.yml @@ -0,0 +1,17 @@ +- name: Run alignment to fastq and then remap on bam files + command: nextflow run main.nf -profile test,alignment_to_fastq,docker -c ./tests/nextflow.config + tags: + - alignment_to_fastq + + files: + - path: 
results/cat/test-1_1.merged.fastq.gz + - path: results/cat/test-1_2.merged.fastq.gz + - path: results/csv/markduplicates.csv + - path: results/csv/markduplicates_no_table.csv + - path: results/csv/recalibrated.csv + - path: results/multiqc/multiqc_report.html + - path: results/pipeline_info + - path: results/preprocessing/test + - path: results/reports + - path: results/samtools + - path: results/collate diff --git a/workflows/sarek.nf b/workflows/sarek.nf index f265f8eae5..05716260a9 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -1020,10 +1020,16 @@ def extract_csv(csv_file) { // start from BAM } else if (row.lane && row.bam) { + if (!row.bai) { + // log.error alone does not stop the run: exit here so the missing index is the reported failure, as done for the other samplesheet errors + log.error "BAM index (bai) should be provided." + System.exit(1) + } meta.id = "${row.sample}-${row.lane}".toString() def bam = file(row.bam, checkIfExists: true) + def bai = file(row.bai, checkIfExists: true) def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' - def read_group = "\"@RG\\tID:${row_sample}_${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:${params.seq_platform}\"" + def read_group = "\"@RG\\tID:${row.sample}_${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:${params.seq_platform}\"" meta.numLanes = numLanes.toInteger() meta.read_group = read_group.toString() @@ -1031,7 +1037,7 @@ def extract_csv(csv_file) { meta.size = 1 // default number of splitted fastq - if (params.step == 'mapping') return [meta, bam] + if (params.step == 'mapping') return [meta, bam, bai] else { log.error "Samplesheet contains ubam files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations" System.exit(1)