|
1 | 1 | #!/usr/bin/env Rscript |
2 | 2 |
|
3 | | -#' We expect to be called from snakemake script directive, so having |
4 | | -#' `snakemake` object with `snakemake@input` etc containing paths. |
| 3 | +library(cli) |
| 4 | + |
| 5 | +# This creates a snakemake object such as is passed to us from |
| 6 | +# snakemake. Useful for debugging interactively. |
| 7 | +if (!exists("snakemake")) { |
| 8 | + cli_h1("0. Parsing commandline options (no snakemake object found)") |
| 9 | + library(optparse) |
| 10 | + selfarg <- grep("--file", commandArgs(trailingOnly = FALSE), value = TRUE) |
| 11 | + self <- sub("--file=", "", selfarg) |
| 12 | + sourcedir <- dirname(self) |
| 13 | + |
| 14 | + parser <- OptionParser( |
| 15 | + option_list = list( |
| 16 | + make_option("--in-samplesheet", help = "PRJ/qiime_mapping.tsv"), |
| 17 | + make_option("--in-multiqc", help = "PRJ.ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data"), |
| 18 | + make_option("--in-gtf", help = "references/hg38g/ALL.gtf"), |
| 19 | + make_option("--out-counts", default = "out.counts.rds"), |
| 20 | + make_option("--out-transcripts", default ="out.txcounts.rds"), |
| 21 | + make_option("--out-stats", default = "out.stats.rds"), |
| 22 | + make_option("--out-log"), |
| 23 | + make_option("--input-type", default = "Salmon"), |
| 24 | + make_option("--set-version", default = "0.0"), |
| 25 | + make_option("--set-label", default = "unknown"), |
| 26 | + make_option("--threads", default = 4) |
| 27 | + ), |
| 28 | + usage = "usage: %prog [options] count_file ...", |
| 29 | + description = paste( |
| 30 | + sep="\n", |
| 31 | + "Creates unified SummarizedExperiment RDS", |
| 32 | + "", |
| 33 | + "Example:", |
| 34 | + "%prog \\", |
| 35 | + " --in-sample-sheet PROJ/qiime_mapping.tsv \\", |
| 36 | + " --in-multiqc PROJ.ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data \\", |
| 37 | + " --in-gtf references/hg38g/ALL.gtf", |
| 38 | + " PROJ.ref_hg38g.qc.quant_salmon_sa/*/quant.sf" |
| 39 | + ) |
| 40 | + ) |
| 41 | + opt <- parse_args2(parser, args = commandArgs(trailingOnly = TRUE)) |
| 42 | + |
| 43 | + Snakemake <- methods::setClass( |
| 44 | + "Snakemake", |
| 45 | + slots = c( |
| 46 | + input = "list", |
| 47 | + output = "list", |
| 48 | + params = "list", |
| 49 | + wildcards = "list", |
| 50 | + threads = "numeric", |
| 51 | + log = "list", |
| 52 | + resources = "list", |
| 53 | + config = "list", |
| 54 | + rule = "character", |
| 55 | + bench_iteration = "numeric", |
| 56 | + scriptdir = "character", |
| 57 | + source = "function" |
| 58 | + ) |
| 59 | + ) |
| 60 | + |
| 61 | + snakemake <- Snakemake( |
| 62 | + input = list( |
| 63 | + counts = opt$args, |
| 64 | + meta = opt$options$in_samplesheet, |
| 65 | + multiqc = opt$options$in_multiqc, |
| 66 | + gtf = opt$options$in_gtf |
| 67 | + ), |
| 68 | + output = list( |
| 69 | + counts = opt$options$out_counts, |
| 70 | + transcripts = opt$options$out_transcripts, |
| 71 | + stats = opt$options$out_stats |
| 72 | + ), |
| 73 | + log = list(opt$options$out_log), |
| 74 | + threads = opt$options$threads, |
| 75 | + params = list( |
| 76 | + version = opt$options$set_version, |
| 77 | + label = opt$options$set_label, |
| 78 | + input_type = opt$options$input_type |
| 79 | + ), |
| 80 | + scriptdir = sourcedir, |
| 81 | + source = function(...) { |
| 82 | + wd <- getwd() |
| 83 | + setwd(snakemake@scriptdir) |
| 84 | + source(...) |
| 85 | + setwd(wd) |
| 86 | + } |
| 87 | + ) |
5 | 88 |
|
6 | | -#' We also need to redirect our output to log ourselves... |
7 | | -logfile <- file(snakemake@log[[1]], open = "wt") |
8 | | -sink(logfile) |
9 | | -sink(logfile, type = "message") |
| 89 | + if (length(snakemake@input$counts) == 0) { |
| 90 | + cli_abort("Missing input count files") |
| 91 | + } |
| 92 | + if (any(!fs::file_exists(snakemake@input$counts))) { |
| 93 | + cli_abort("Some input count files are missing") |
| 94 | + } |
| 95 | + if (!fs::file_exists(snakemake@input$meta)) { |
| 96 | + cli_abort("Sample sheet is missing") |
| 97 | + } |
| 98 | + if (!fs::file_exists(snakemake@input$multiqc)) { |
| 99 | + cli_abort("MultiQC data folder is missing") |
| 100 | + } |
| 101 | + if (!fs::file_exists(snakemake@input$gtf)) { |
| 102 | + cli_abort("GTF file is missing") |
| 103 | + } |
| 104 | +} |
10 | 105 |
|
11 | | -message("Importing ", snakemake@params$input_type, " data into R") |
| 106 | +#' We need to redirect our output to log if running from snakemake... |
12 | 107 |
|
13 | | -message("1. ----------- Loading packages ----------") |
| 108 | +if (!is.null(snakemake@log[[1]])) { |
| 109 | + logfile <- file(snakemake@log[[1]], open = "wt") |
| 110 | + sink(logfile) |
| 111 | + sink(logfile, type = "message") |
| 112 | +} |
| 113 | + |
| 114 | +cli_alert("Importing {snakemake@params$input_type} data into R") |
| 115 | + |
| 116 | +cli_h1("1. Loading packages") |
14 | 117 | library(tximport) |
15 | 118 | library(readr) |
16 | 119 | library(GenomicFeatures) |
@@ -63,113 +166,7 @@ get_idcols <- function(table, ids, allow_short = FALSE) { |
63 | 166 | res |
64 | 167 | } |
65 | 168 |
|
66 | | -load_all_fastqc <- function(sample_sheet, path) { |
67 | | - fastqc <- NULL |
68 | | - for (n in c(1, 2)) { |
69 | | - suffix <- c("", "_1")[n] |
70 | | - trimmed <- n == 2 |
71 | | - fastqc_fn <- fs::path( |
72 | | - path, |
73 | | - str_glue("multiqc_fastqc{suffix}.txt") |
74 | | - ) |
75 | | - if (fs::file_exists(fastqc_fn)) { |
76 | | - fastqc_data <- load_fastqc(fastqc_fn, trimmed) |
77 | | - if (is.null(fastqc)) { |
78 | | - fastqc <- fastqc_data |
79 | | - } else { |
80 | | - fastqc <- bind_rows(fastqc, fastqc_data) |
81 | | - } |
82 | | - } |
83 | | - } |
84 | | - fastqc |
85 | | -} |
86 | | - |
87 | | -load_fastqc <- function(fastqc_fn, trimmed) { |
88 | | - message(" Loading ", if (trimmed) "trimmed" else "raw", |
89 | | - " read fastqc data") |
90 | | - fastqc_data <- read_tsv(fastqc_fn, show_col_types = FALSE) %>% |
91 | | - # Sequence length can be `95` or `95-151`, split here |
92 | | - tidyr::separate( |
93 | | - col = `Sequence length`, |
94 | | - into = c("read_len_min", "read_len_max"), |
95 | | - convert = TRUE, |
96 | | - fill = "right" |
97 | | - ) %>% |
98 | | - transmute( |
99 | | - # filename is just sample.R[12].fq.gz, ignoring |
100 | | - fastqc_id = sub(".R[12]$", "", Sample), |
101 | | - mate = if_else(str_ends(Sample, "R2"), "R2", "R1"), |
102 | | - num_reads = `Total Sequences`, |
103 | | - read_len_min, |
104 | | - read_len_max = ifelse( |
105 | | - is.na(read_len_max), read_len_min, read_len_max |
106 | | - ), |
107 | | - read_len_avg = avg_sequence_length, |
108 | | - pct_gc = `%GC`, |
109 | | - # This is the percentage of unique reads (dedup/total) |
110 | | - pct_unique = total_deduplicated_percentage, |
111 | | - trimmed = trimmed |
112 | | - ) |
113 | | - |
114 | | - message(" Determining sample sheet column matching fastqc ids") |
115 | | - # Find columns identifying the fastqc files in sample sheet |
116 | | - fastqc_idcolumns <- get_idcols(sample_sheet, fastqc_data$fastqc_id, |
117 | | - allow_short = TRUE) |
118 | | - message(" FastQC files identified by: ", |
119 | | - paste(fastqc_idcolumns, collapse = ", ")) |
120 | | - |
121 | | - # Extract paths |
122 | | - # |
123 | | - # FIXME: This is a bad hack. We should get the detected fwd |
124 | | - # and rev read from YMP somehow. Also, this won't work |
125 | | - # for SRR sources. |
126 | | - fastq_id_to_path <- sample_sheet %>% |
127 | | - # One row per FQ file |
128 | | - pivot_longer( |
129 | | - cols = where( |
130 | | - ~any(str_detect(., "(fastq|fq).gz$"), na.rm = TRUE) |
131 | | - ), |
132 | | - values_to = "fastq_file_path", |
133 | | - names_to = "path_col" |
134 | | - ) %>% |
135 | | - mutate( |
136 | | - # Make sure ID columns are character |
137 | | - across(all_of(fastqc_idcolumns), as.character), |
138 | | - # Deduce mate from file name |
139 | | - mate = if_else( |
140 | | - str_detect(basename(fastq_file_path), "(_|\\.)R1"), |
141 | | - "R1", "R2" |
142 | | - ), |
143 | | - # Pick ID column |
144 | | - fastqc_id = as.character(.data[[fastqc_idcolumns[1]]]) |
145 | | - ) %>% |
146 | | - # Remove anything we can't uniquely match |
147 | | - group_by(fastqc_id, mate) %>% |
148 | | - filter(n() == 1) %>% |
149 | | - ungroup() |
150 | 169 |
|
151 | | - # Merge this into the fastqc df |
152 | | - fastqc_data <- fastqc_data %>% |
153 | | - left_join( |
154 | | - fastq_id_to_path, |
155 | | - by = c("fastqc_id", "mate") |
156 | | - ) %>% |
157 | | - # move ids to front |
158 | | - relocate(any_of(fastqc_idcolumns)) %>% |
159 | | - # remove convenience fastqc_id |
160 | | - select(-fastqc_id) |
161 | | - |
162 | | - if (any(is.na(fastqc_data$fastq_file_path))) { |
163 | | - message( |
164 | | - "WARNING: ", |
165 | | - "Failed to identify fastq file paths for all samples" |
166 | | - ) |
167 | | - message("---- BEGIN fastqc data w/o file path ----") |
168 | | - print(filter(fastqc_data, is.na(fastq_file_path))) |
169 | | - message("---- END fastqc data w/o file path ----") |
170 | | - } |
171 | | - fastqc_data |
172 | | -} |
173 | 170 |
|
174 | 171 | is_header_only_csv <- function(files, col_types) { |
175 | 172 | future_sapply(files, function(fn) { |
@@ -295,11 +292,44 @@ metadata <- list( |
295 | 292 |
|
296 | 293 | message("2.2. ----------- Loading MultiQC Report Data ----------") |
297 | 294 |
|
298 | | -metadata$fastqc <- load_all_fastqc(sample_sheet, snakemake@input$multiqc) |
299 | | -if (!is.null(metadata$fastqc)) { |
300 | | - # Show this (tibble, so will be short) |
301 | | - print(metadata$fastqc) |
| 295 | +multiqc_data_file <- fs::path(snakemake@input$multiqc, "multiqc_data.json") |
| 296 | +multiqc <- jsonlite::fromJSON(multiqc_data_file) |
| 297 | + |
| 298 | +extract_counts <- function(col, json) { |
| 299 | + counts <- sapply( |
| 300 | + names(json), |
| 301 | + \(id) as.integer(json[[id]]["Total Sequences"]) |
| 302 | + ) |
| 303 | + res <- enframe(counts, name="ids", value = col) %>% |
| 304 | + separate("ids", c("ids", "mate"), sep = "\\.", fill="right") |
| 305 | + err <- group_by(res, ids) %>% filter(n_distinct(.data[[col]]) != 1) |
| 306 | + if (nrow(err)) { |
| 307 | + cli_alert("Found errors importing {col}:") |
| 308 | + print(n=1000, err) |
| 309 | + cli_abort("Failed to import read counts from multiqc fastqc report") |
| 310 | + } |
| 311 | + res <- res %>% group_by(ids) %>% |
| 312 | + summarize("{col}":=unique(.data[[col]]), .groups="drop") |
302 | 313 | } |
| 314 | +fastqc_trimmed <- extract_counts( |
| 315 | + "trimmed_reads", multiqc$report_saved_raw_data$multiqc_fastqc |
| 316 | +) |
| 317 | +fastqc_raw <- extract_counts( |
| 318 | + "raw_reads", multiqc$report_saved_raw_data$multiqc_fastqc_1 |
| 319 | +) |
| 320 | +fastqc <- full_join(fastqc_raw, fastqc_trimmed, by = "ids") |
| 321 | +fastqc_idcols <- get_idcols(sample_sheet, fastqc$ids, allow_short = TRUE) |
| 322 | +cli_alert_info("FastQC data identified by '{fastqc_idcols}'") |
| 323 | + |
| 324 | +sample_sheet <- sample_sheet %>% |
| 325 | + left_join(fastqc, by = set_names("ids", fastqc_idcols[1])) |
| 326 | +err <- filter(sample_sheet, is.na(trimmed_reads) | is.na(raw_reads)) |
| 327 | +if (nrow(err) > 0) { |
| 328 | + cli_alert("Read counts missing for some samples:") |
| 329 | + print(n=1000, err) |
| 330 | + cli_abort("Incomplete read count information from multiqc fastqc report") |
| 331 | +} |
| 332 | + |
303 | 333 |
|
304 | 334 | message("2.3. ----------- Loading GTF ----------") |
305 | 335 | message("Filename = ", snakemake@input$gtf) |
@@ -446,10 +476,12 @@ if (snakemake@params$input_type == "ExonSE") { |
446 | 476 | txi_genes <- tximport(gene_files, type = "rsem", |
447 | 477 | txIn = FALSE, txOut = FALSE) |
448 | 478 |
|
449 | | - ## Something inside of tximport seems to reset the log sink on the |
450 | | - ## second call. Resetting it here: |
451 | | - sink(logfile) |
452 | | - sink(logfile, type = "message") |
| 479 | + if (!is.null(logfile)) { |
| 480 | + ## Something inside of tximport seems to reset the log sink on the |
| 481 | + ## second call. Resetting it here: |
| 482 | + sink(logfile) |
| 483 | + sink(logfile, type = "message") |
| 484 | + } |
453 | 485 | } |
454 | 486 |
|
455 | 487 | message("7. ----------- Assembling SummarizedExperiment ----------") |
@@ -586,53 +618,3 @@ if (snakemake@params$input_type == "ExonSE") { |
586 | 618 | saveRDS(metadata(gse), snakemake@output$stats) |
587 | 619 | } |
588 | 620 | message("done") |
589 | | -q() |
590 | | - |
591 | | - |
592 | | -if (FALSE) { |
593 | | - # This creates a snakemake object such as is passed to us from |
594 | | - # snakemake. Useful for debugging interactively. |
595 | | - Snakemake <- methods::setClass( |
596 | | - "Snakemake", |
597 | | - slots = c( |
598 | | - input = "list", |
599 | | - output = "list", |
600 | | - params = "list", |
601 | | - wildcards = "list", |
602 | | - threads = "numeric", |
603 | | - log = "list", |
604 | | - resources = "list", |
605 | | - config = "list", |
606 | | - rule = "character", |
607 | | - bench_iteration = "numeric", |
608 | | - scriptdir = "character", |
609 | | - source = "function" |
610 | | - ) |
611 | | - ) |
612 | | - setwd("/Seibold/tmp/pipeline/work") |
613 | | - project <- "gala" |
614 | | - snakemake <- Snakemake( |
615 | | - scriptdir = "/Seibold/tmp/pipeline/work/virprof/rules", |
616 | | - source = function(...) { |
617 | | - wd <- getwd() |
618 | | - setwd(snakemake@scriptdir) |
619 | | - source(...) |
620 | | - setwd(wd) |
621 | | - } |
622 | | - ) |
623 | | - snakemake@input$counts <- fs::dir_ls( |
624 | | - path = paste0(project, ".ref_hg38g.qc.quant_salmon_sa"), |
625 | | - glob = "*/quant.sf", |
626 | | - recurse = TRUE |
627 | | - ) |
628 | | - snakemake@input$meta <- file.path(project, "qiime_mapping.tsv") |
629 | | - snakemake@input$multiqc <- paste0( |
630 | | - project, |
631 | | - ".ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data" |
632 | | - ) |
633 | | - snakemake@input$gtf <- "references/hg38g/ALL.gtf" |
634 | | - snakemake@threads <- 16 |
635 | | - snakemake@params$version <- "0.0.0" |
636 | | - snakemake@params$label <- "testing manually" |
637 | | - snakemake@params$input_type <- "Salmon" |
638 | | -} |
0 commit comments