Skip to content

Commit 3b18665

Browse files
committed
refactor(rna-seq): commit old refactor of rnaseq
1 parent 7b2cf47 commit 3b18665

1 file changed

Lines changed: 154 additions & 172 deletions

File tree

rules/tximport.R

Lines changed: 154 additions & 172 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,119 @@
11
#!/usr/bin/env Rscript
22

3-
#' We expect to be called from snakemake script directive, so having
4-
#' `snakemake` object with `snakemake@input` etc containing paths.
3+
library(cli)
4+
5+
# This creates a snakemake object such as is passed to us from
6+
# snakemake. Useful for debugging interactively.
7+
if (!exists("snakemake")) {
8+
cli_h1("0. Parsing commandline options (no snakemake object found)")
9+
library(optparse)
10+
selfarg <- grep("--file", commandArgs(trailingOnly = FALSE), value = TRUE)
11+
self <- sub("--file=", "", selfarg)
12+
sourcedir <- dirname(self)
13+
14+
parser <- OptionParser(
15+
option_list = list(
16+
make_option("--in-samplesheet", help = "PRJ/qiime_mapping.tsv"),
17+
make_option("--in-multiqc", help = "PRJ.ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data"),
18+
make_option("--in-gtf", help = "references/hg38g/ALL.gtf"),
19+
make_option("--out-counts", default = "out.counts.rds"),
20+
make_option("--out-transcripts", default ="out.txcounts.rds"),
21+
make_option("--out-stats", default = "out.stats.rds"),
22+
make_option("--out-log"),
23+
make_option("--input-type", default = "Salmon"),
24+
make_option("--set-version", default = "0.0"),
25+
make_option("--set-label", default = "unknown"),
26+
make_option("--threads", default = 4)
27+
),
28+
usage = "usage: %prog [options] count_file ...",
29+
description = paste(
30+
sep="\n",
31+
"Creates unified SummarizedExperiment RDS",
32+
"",
33+
"Example:",
34+
"%prog \\",
35+
" --in-sample-sheet PROJ/qiime_mapping.tsv \\",
36+
" --in-multiqc PROJ.ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data \\",
37+
" --in-gtf references/hg38g/ALL.gtf",
38+
" PROJ.ref_hg38g.qc.quant_salmon_sa/*/quant.sf"
39+
)
40+
)
41+
opt <- parse_args2(parser, args = commandArgs(trailingOnly = TRUE))
42+
43+
Snakemake <- methods::setClass(
44+
"Snakemake",
45+
slots = c(
46+
input = "list",
47+
output = "list",
48+
params = "list",
49+
wildcards = "list",
50+
threads = "numeric",
51+
log = "list",
52+
resources = "list",
53+
config = "list",
54+
rule = "character",
55+
bench_iteration = "numeric",
56+
scriptdir = "character",
57+
source = "function"
58+
)
59+
)
60+
61+
snakemake <- Snakemake(
62+
input = list(
63+
counts = opt$args,
64+
meta = opt$options$in_samplesheet,
65+
multiqc = opt$options$in_multiqc,
66+
gtf = opt$options$in_gtf
67+
),
68+
output = list(
69+
counts = opt$options$out_counts,
70+
transcripts = opt$options$out_transcripts,
71+
stats = opt$options$out_stats
72+
),
73+
log = list(opt$options$out_log),
74+
threads = opt$options$threads,
75+
params = list(
76+
version = opt$options$set_version,
77+
label = opt$options$set_label,
78+
input_type = opt$options$input_type
79+
),
80+
scriptdir = sourcedir,
81+
source = function(...) {
82+
wd <- getwd()
83+
setwd(snakemake@scriptdir)
84+
source(...)
85+
setwd(wd)
86+
}
87+
)
588

6-
#' We also need to redirect our output to log ourselves...
7-
logfile <- file(snakemake@log[[1]], open = "wt")
8-
sink(logfile)
9-
sink(logfile, type = "message")
89+
if (length(snakemake@input$counts) == 0) {
90+
cli_abort("Missing input count files")
91+
}
92+
if (any(!fs::file_exists(snakemake@input$counts))) {
93+
cli_abort("Some input count files are missing")
94+
}
95+
if (!fs::file_exists(snakemake@input$meta)) {
96+
cli_abort("Sample sheet is missing")
97+
}
98+
if (!fs::file_exists(snakemake@input$multiqc)) {
99+
cli_abort("MultiQC data folder is missing")
100+
}
101+
if (!fs::file_exists(snakemake@input$gtf)) {
102+
cli_abort("GTF file is missing")
103+
}
104+
}
10105

11-
message("Importing ", snakemake@params$input_type, " data into R")
106+
#' We need to redirect our output to log if running from snakemake...
12107

13-
message("1. ----------- Loading packages ----------")
108+
if (!is.null(snakemake@log[[1]])) {
109+
logfile <- file(snakemake@log[[1]], open = "wt")
110+
sink(logfile)
111+
sink(logfile, type = "message")
112+
}
113+
114+
cli_alert("Importing {snakemake@params$input_type} data into R")
115+
116+
cli_h1("1. Loading packages")
14117
library(tximport)
15118
library(readr)
16119
library(GenomicFeatures)
@@ -63,113 +166,7 @@ get_idcols <- function(table, ids, allow_short = FALSE) {
63166
res
64167
}
65168

66-
load_all_fastqc <- function(sample_sheet, path) {
67-
fastqc <- NULL
68-
for (n in c(1, 2)) {
69-
suffix <- c("", "_1")[n]
70-
trimmed <- n == 2
71-
fastqc_fn <- fs::path(
72-
path,
73-
str_glue("multiqc_fastqc{suffix}.txt")
74-
)
75-
if (fs::file_exists(fastqc_fn)) {
76-
fastqc_data <- load_fastqc(fastqc_fn, trimmed)
77-
if (is.null(fastqc)) {
78-
fastqc <- fastqc_data
79-
} else {
80-
fastqc <- bind_rows(fastqc, fastqc_data)
81-
}
82-
}
83-
}
84-
fastqc
85-
}
86-
87-
load_fastqc <- function(fastqc_fn, trimmed) {
88-
message(" Loading ", if (trimmed) "trimmed" else "raw",
89-
" read fastqc data")
90-
fastqc_data <- read_tsv(fastqc_fn, show_col_types = FALSE) %>%
91-
# Sequence length can be `95` or `95-151`, split here
92-
tidyr::separate(
93-
col = `Sequence length`,
94-
into = c("read_len_min", "read_len_max"),
95-
convert = TRUE,
96-
fill = "right"
97-
) %>%
98-
transmute(
99-
# filename is just sample.R[12].fq.gz, ignoring
100-
fastqc_id = sub(".R[12]$", "", Sample),
101-
mate = if_else(str_ends(Sample, "R2"), "R2", "R1"),
102-
num_reads = `Total Sequences`,
103-
read_len_min,
104-
read_len_max = ifelse(
105-
is.na(read_len_max), read_len_min, read_len_max
106-
),
107-
read_len_avg = avg_sequence_length,
108-
pct_gc = `%GC`,
109-
# This is the percentage of unique reads (dedup/total)
110-
pct_unique = total_deduplicated_percentage,
111-
trimmed = trimmed
112-
)
113-
114-
message(" Determining sample sheet column matching fastqc ids")
115-
# Find columns identifying the fastqc files in sample sheet
116-
fastqc_idcolumns <- get_idcols(sample_sheet, fastqc_data$fastqc_id,
117-
allow_short = TRUE)
118-
message(" FastQC files identified by: ",
119-
paste(fastqc_idcolumns, collapse = ", "))
120-
121-
# Extract paths
122-
#
123-
# FIXME: This is a bad hack. We should get the detected fwd
124-
# and rev read from YMP somehow. Also, this won't work
125-
# for SRR sources.
126-
fastq_id_to_path <- sample_sheet %>%
127-
# One row per FQ file
128-
pivot_longer(
129-
cols = where(
130-
~any(str_detect(., "(fastq|fq).gz$"), na.rm = TRUE)
131-
),
132-
values_to = "fastq_file_path",
133-
names_to = "path_col"
134-
) %>%
135-
mutate(
136-
# Make sure ID columns are character
137-
across(all_of(fastqc_idcolumns), as.character),
138-
# Deduce mate from file name
139-
mate = if_else(
140-
str_detect(basename(fastq_file_path), "(_|\\.)R1"),
141-
"R1", "R2"
142-
),
143-
# Pick ID column
144-
fastqc_id = as.character(.data[[fastqc_idcolumns[1]]])
145-
) %>%
146-
# Remove anything we can't uniquely match
147-
group_by(fastqc_id, mate) %>%
148-
filter(n() == 1) %>%
149-
ungroup()
150169

151-
# Merge this into the fastqc df
152-
fastqc_data <- fastqc_data %>%
153-
left_join(
154-
fastq_id_to_path,
155-
by = c("fastqc_id", "mate")
156-
) %>%
157-
# move ids to front
158-
relocate(any_of(fastqc_idcolumns)) %>%
159-
# remove convenience fastqc_id
160-
select(-fastqc_id)
161-
162-
if (any(is.na(fastqc_data$fastq_file_path))) {
163-
message(
164-
"WARNING: ",
165-
"Failed to identify fastq file paths for all samples"
166-
)
167-
message("---- BEGIN fastqc data w/o file path ----")
168-
print(filter(fastqc_data, is.na(fastq_file_path)))
169-
message("---- END fastqc data w/o file path ----")
170-
}
171-
fastqc_data
172-
}
173170

174171
is_header_only_csv <- function(files, col_types) {
175172
future_sapply(files, function(fn) {
@@ -295,11 +292,44 @@ metadata <- list(
295292

296293
message("2.2. ----------- Loading MultiQC Report Data ----------")
297294

298-
metadata$fastqc <- load_all_fastqc(sample_sheet, snakemake@input$multiqc)
299-
if (!is.null(metadata$fastqc)) {
300-
# Show this (tibble, so will be short)
301-
print(metadata$fastqc)
295+
multiqc_data_file <- fs::path(snakemake@input$multiqc, "multiqc_data.json")
296+
multiqc <- jsonlite::fromJSON(multiqc_data_file)
297+
298+
extract_counts <- function(col, json) {
299+
counts <- sapply(
300+
names(json),
301+
\(id) as.integer(json[[id]]["Total Sequences"])
302+
)
303+
res <- enframe(counts, name="ids", value = col) %>%
304+
separate("ids", c("ids", "mate"), sep = "\\.", fill="right")
305+
err <- group_by(res, ids) %>% filter(n_distinct(.data[[col]]) != 1)
306+
if (nrow(err)) {
307+
cli_alert("Found errors importing {col}:")
308+
print(n=1000, err)
309+
cli_abort("Failed to import read counts from multiqc fastqc report")
310+
}
311+
res <- res %>% group_by(ids) %>%
312+
summarize("{col}":=unique(.data[[col]]), .groups="drop")
302313
}
314+
fastqc_trimmed <- extract_counts(
315+
"trimmed_reads", multiqc$report_saved_raw_data$multiqc_fastqc
316+
)
317+
fastqc_raw <- extract_counts(
318+
"raw_reads", multiqc$report_saved_raw_data$multiqc_fastqc_1
319+
)
320+
fastqc <- full_join(fastqc_raw, fastqc_trimmed, by = "ids")
321+
fastqc_idcols <- get_idcols(sample_sheet, fastqc$ids, allow_short = TRUE)
322+
cli_alert_info("FastQC data identified by '{fastqc_idcols}'")
323+
324+
sample_sheet <- sample_sheet %>%
325+
left_join(fastqc, by = set_names("ids", fastqc_idcols[1]))
326+
err <- filter(sample_sheet, is.na(trimmed_reads) | is.na(raw_reads))
327+
if (nrow(err) > 0) {
328+
cli_alert("Read counts missing for some samples:")
329+
print(n=1000, err)
330+
cli_abort("Incomplete read count information from multiqc fastqc report")
331+
}
332+
303333

304334
message("2.3. ----------- Loading GTF ----------")
305335
message("Filename = ", snakemake@input$gtf)
@@ -446,10 +476,12 @@ if (snakemake@params$input_type == "ExonSE") {
446476
txi_genes <- tximport(gene_files, type = "rsem",
447477
txIn = FALSE, txOut = FALSE)
448478

449-
## Something inside of tximport seems to reset the log sink on the
450-
## second call. Resetting it here:
451-
sink(logfile)
452-
sink(logfile, type = "message")
479+
if (!is.null(logfile)) {
480+
## Something inside of tximport seems to reset the log sink on the
481+
## second call. Resetting it here:
482+
sink(logfile)
483+
sink(logfile, type = "message")
484+
}
453485
}
454486

455487
message("7. ----------- Assembling SummarizedExperiment ----------")
@@ -586,53 +618,3 @@ if (snakemake@params$input_type == "ExonSE") {
586618
saveRDS(metadata(gse), snakemake@output$stats)
587619
}
588620
message("done")
589-
q()
590-
591-
592-
if (FALSE) {
593-
# This creates a snakemake object such as is passed to us from
594-
# snakemake. Useful for debugging interactively.
595-
Snakemake <- methods::setClass(
596-
"Snakemake",
597-
slots = c(
598-
input = "list",
599-
output = "list",
600-
params = "list",
601-
wildcards = "list",
602-
threads = "numeric",
603-
log = "list",
604-
resources = "list",
605-
config = "list",
606-
rule = "character",
607-
bench_iteration = "numeric",
608-
scriptdir = "character",
609-
source = "function"
610-
)
611-
)
612-
setwd("/Seibold/tmp/pipeline/work")
613-
project <- "gala"
614-
snakemake <- Snakemake(
615-
scriptdir = "/Seibold/tmp/pipeline/work/virprof/rules",
616-
source = function(...) {
617-
wd <- getwd()
618-
setwd(snakemake@scriptdir)
619-
source(...)
620-
setwd(wd)
621-
}
622-
)
623-
snakemake@input$counts <- fs::dir_ls(
624-
path = paste0(project, ".ref_hg38g.qc.quant_salmon_sa"),
625-
glob = "*/quant.sf",
626-
recurse = TRUE
627-
)
628-
snakemake@input$meta <- file.path(project, "qiime_mapping.tsv")
629-
snakemake@input$multiqc <- paste0(
630-
project,
631-
".ref_hg38g.qc.quant_salmon_sa.group_ALL.qc_multiqc/multiqc_report_data"
632-
)
633-
snakemake@input$gtf <- "references/hg38g/ALL.gtf"
634-
snakemake@threads <- 16
635-
snakemake@params$version <- "0.0.0"
636-
snakemake@params$label <- "testing manually"
637-
snakemake@params$input_type <- "Salmon"
638-
}

0 commit comments

Comments
 (0)