diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 49d4528..7028b49 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,19 +1,22 @@ # load required packages +library("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AssignToBins") -mzml_filepath <- cmd_args[1] -breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) +parser$add_argument("--mzML_filepath", dest = "mzml_filepath", + help = "File path for the mzML file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # get sample name -techrep_name <- sub("\\..*$", "", basename(mzml_filepath)) +techrep_name <- sub("\\..*$", "", basename(args$mzml_filepath)) options(digits = 16) @@ -26,7 +29,7 @@ neg_bins <- bins dims_thresh <- 100 # read in the data for 1 sample -raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$mzml_filepath)) # Generate a matrix with retention times and intensities raw_data_matrix <- xcms::rawMat(raw_data) @@ -41,13 +44,13 @@ neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_righ # get TIC intensities for areas between trim_left and trim_right tic_intensity_persample <- cbind(raw_data@scantime, raw_data@tic) colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") -tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(pos_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ] -tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) & - tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ] +tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(pos_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(pos_times_trimmed), ] +tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[, "retention_time"] > min(neg_times_trimmed) & + tic_intensity_persample[, "retention_time"] < max(neg_times_trimmed), ] # calculate weighted mean of intensities for pos and neg separately -mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"]) -mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"]) +mean_pos <- weighted.mean(tic_intensity_pos[, "tic_intensity"], tic_intensity_pos[, "tic_intensity"]) +mean_neg <- weighted.mean(tic_intensity_neg[, "tic_intensity"], tic_intensity_neg[, "tic_intensity"]) # intensity per scan should be at least 80% of weighted mean dims_thresh_pos <- 0.8 * mean_pos dims_thresh_neg <- 0.8 * mean_neg @@ -67,17 +70,17 @@ neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values bin_indices_pos <- cut( - pos_raw_data_matrix[, "mz"], + pos_raw_data_matrix[, "mz"], breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) bin_indices_neg <- cut( - neg_raw_data_matrix[, "mz"], - breaks_fwhm, - include.lowest = TRUE, - right = TRUE, + neg_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, labels = FALSE ) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index d0a7098..df3b089 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -1,7 +1,7 @@ process AssignToBins { tag "DIMS AssignToBins ${file_id}" label 'AssignToBins' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,7 +13,9 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R \\ + --mzML_filepath $mzML_file \\ + --breaks_filepath $breaks_file """ } diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 1c727d1..6376339 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,28 +1,37 @@ # adapted from 3-AverageTechReplicates.R # load packages +library("argparse") library("ggplot2") library("gridExtra") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "AverageTechReplicates") -init_file <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] -highest_mz_file <- cmd_args[5] -highest_mz <- get(load(highest_mz_file)) -breaks_filepath <- cmd_args[6] -thresh2remove <- as.numeric(cmd_args[7]) +parser$add_argument("--init_filepath", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("-n", "--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates", required = TRUE) +parser$add_argument("--run_name", dest = "run_name", + help = "The run name/analysis ID", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. Plasma, Research, ...") +parser$add_argument("--highest_mz_file", dest = "highest_mz_file", + help = "File path for the highest Mz RData file", required = TRUE) +parser$add_argument("--breaks_filepath", dest = "breaks_filepath", + help = "File path for the breaks RData file", required = TRUE) + +args <- parser$parse_args() + +highest_mz <- get(load(args$highest_mz_file)) +thresh2remove <- 1000000000 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { # collect list of samples to remove from replication pattern remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ + for (sample_nr in seq_along(repl_pattern)){ repl_pattern_1sample <- repl_pattern[[sample_nr]] remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { + for (file_nr in seq_along(repl_pattern_1sample)) { if (repl_pattern_1sample[file_nr] %in% bad_samples) { remove <- c(remove, file_nr) } @@ -41,11 +50,11 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { } # load init_file: contains repl_pattern -load(init_file) +load(args$init_file) # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, # trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +load(args$breaks_filepath) # lower the threshold for non Plasma matrices if (dims_matrix != "Plasma") { @@ -64,7 +73,7 @@ if (highest_mz > 700) { remove_neg <- NULL remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) tech_reps_array_pos <- NULL tech_reps_array_neg <- NULL @@ -72,7 +81,7 @@ for (sample_nr in 1:length(repl_pattern)) { sum_pos <- 0 nr_pos <- 0 nr_neg <- 0 - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { load(paste(tech_reps[file_nr], ".RData", sep = "")) cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode @@ -96,7 +105,7 @@ for (sample_nr in 1:length(repl_pattern)) { } tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) } - # save to file + # save to file if (nr_neg != 0) { sum_neg[, 1] <- sum_neg[, 1] / nr_neg colnames(sum_neg) <- names(repl_pattern)[sample_nr] @@ -109,25 +118,25 @@ for (sample_nr in 1:length(repl_pattern)) { } } -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "negative_repl_pattern.RData") write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, args$nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "positive_repl_pattern.RData") write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, sep = "\t" ) @@ -150,10 +159,10 @@ for (file in tic_files) { # create a list with information for all TIC plots tic_plot_list <- list() plot_nr <- 0 -for (sample_nr in c(1:length(repl_pattern))) { +for (sample_nr in seq_along(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sample_name <- names(repl_pattern)[sample_nr] - for (file_nr in 1:length(tech_reps)) { + for (file_nr in seq_along(tech_reps)) { plot_nr <- plot_nr + 1 # read file with retention time, intensity and dims_threshold values repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) @@ -163,7 +172,7 @@ for (sample_nr in c(1:length(repl_pattern))) { # for replicates with bad TIC, determine what color the border of the plot should be bad_color_pos <- tech_reps[file_nr] %in% remove_pos bad_color_neg <- tech_reps[file_nr] %in% remove_neg - if (bad_color_neg & bad_color_pos) { + if (bad_color_neg && bad_color_pos) { plot_color <- "#F8766D" } else if (bad_color_pos) { plot_color <- "#ED8141" @@ -191,19 +200,19 @@ for (sample_nr in c(1:length(repl_pattern))) { } # create a layout matrix dependent on number of replicates -layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +layout <- matrix(1:(10 * args$nr_replicates), 10, args$nr_replicates, TRUE) # put TIC plots in matrix tic_plot_pdf <- marrangeGrob( grobs = tic_plot_list, - nrow = 10, ncol = nr_replicates, + nrow = 10, ncol = args$nr_replicates, layout_matrix = layout, top = quote(paste( - "TICs of run", run_name, + "TICs of run", args$run_name, " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages )) ) # save to file -ggsave(filename = paste0(run_name, "_TICplots.pdf"), +ggsave(filename = paste0(args$run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index f03d553..12606cf 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -1,7 +1,7 @@ process AverageTechReplicates { tag "DIMS AverageTechReplicates" label 'AverageTechReplicates' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -23,13 +23,13 @@ process AverageTechReplicates { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ - $params.nr_replicates \ - $analysis_id \ - $matrix \ - $highest_mz_file \ - $breaks_file \ - $params.threshold_tics + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R \\ + --init_filepath $init_file \\ + --nr_replicates $params.nr_replicates \\ + --run_name $analysis_id \\ + --matrix $matrix \\ + --highest_mz_file $highest_mz_file \\ + --breaks_filepath $breaks_file """ } diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 4bd408e..265db5d 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,14 +1,17 @@ -## adapted from 10-collectSamplesFilled.R +# load package +library("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "CollectFilled") -scripts_dir <- cmd_args[1] -ppm <- as.numeric(cmd_args[2]) -z_score <- as.numeric(cmd_args[3]) +parser$add_argument("--scripts_dir", dest = "scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) -source(paste0(scripts_dir, "merge_duplicate_rows.R")) -source(paste0(scripts_dir, "calculate_zscores.R")) +args <- parser$parse_args() + +source(paste0(args$scripts_dir, "merge_duplicate_rows.R")) +source(paste0(args$scripts_dir, "calculate_zscores.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") @@ -17,7 +20,7 @@ for (scanmode in scanmodes) { filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) # load files and combine into one object outlist_total <- NULL - for (file_nr in 1:length(filled_files)) { + for (file_nr in seq_along(filled_files)) { peakgrouplist_filled <- get(load(filled_files[file_nr])) outlist_total <- rbind(outlist_total, peakgrouplist_filled) } @@ -29,7 +32,7 @@ for (scanmode in scanmodes) { pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) # calculate Z-scores - if (z_score == 1) { + if (args$z_score == 1) { outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) @@ -47,7 +50,7 @@ for (scanmode in scanmodes) { outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } - + # make a copy of the outlist outlist_ident <- outlist_total # take care of NAs in theormz_noise diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 53b1967..1f17444 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -1,7 +1,7 @@ process CollectFilled { tag "DIMS CollectFilled" label 'CollectFilled' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -14,6 +14,8 @@ process CollectFilled { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R \\ + --scripts_dir $params.scripts_dir \\ + --z_score $params.zscore """ } diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index 28b5bf0..10752ad 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,13 +1,16 @@ ## Combining all AdductSums part files for each scanmode and # combine intensities if present in both scanmodes +library("argparse") suppressMessages(library("dplyr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateExcel") -preprocessing_scripts_dir <- cmd_args[1] +parser$add_argument("--preprocessing_scripts_dir", dest = "preprocessing_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) -source(paste0(preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) +args <- parser$parse_args() + +source(paste0(args$preprocessing_scripts_dir, "collect_sum_adducts_functions.R")) # collect all AdductSums part files for each scanmode and save to RData file outlist_tot_pos <- combine_sum_adduct_parts("positive") diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index 279cd0e..06aa82c 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -1,7 +1,7 @@ process CollectSumAdducts { tag "DIMS CollectSumAdducts" label 'CollectSumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,6 +13,7 @@ process CollectSumAdducts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R \\ + --preprocessing_scripts_dir $params.preprocessing_scripts_dir """ } diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index b0d1850..7678c95 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -1,7 +1,7 @@ process FillMissing { tag "DIMS FillMissing ${peakgrouplist_file}" label 'FillMissing' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd20..0943c32 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,15 +1,17 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - -# load required package +# load required packages +load("argparse") suppressPackageStartupMessages(library("xcms")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateBreaks") + +parser$add_argument("--raw_file", dest = "raw_filepath", + help = "File path to a raw file", required = TRUE) +parser$add_argument("--trim", dest = "trim", type = "integer", + help = "Trim precentage, numeric value", required = TRUE) +parser$add_argument("--resolution", dest = "resolution", type = "integer", + help = "Resolution of the MS machine, numeric value", required = TRUE) -filepath <- cmd_args[1] -outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) +args <- parser$parse_args() # initialize trim_left_pos <- NULL @@ -21,19 +23,19 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(args$raw_filepath)) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin -trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind +trim_left_pos <- round(pos_times[length(pos_times) * (args$trim * 1.5)]) # 15% aan het begin +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (args$trim * 0.5))]) # 5% aan het eind # trim (remove) scans at the start and end for negative -trim_left_neg <- round(neg_times[length(neg_times) * trim]) -trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) +trim_left_neg <- round(neg_times[length(neg_times) * args$trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - args$trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] @@ -46,8 +48,8 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. for (i in 1:nr_segments) { start_segment <- segment[i] - end_segment <- segment[i+1] - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + end_segment <- segment[i + 1] + resol_mz <- args$resolution * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8..ac0c284 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -1,7 +1,7 @@ process GenerateBreaks { tag "DIMS GenerateBreaks" label 'GenerateBreaks' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -14,6 +14,9 @@ process GenerateBreaks { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R \\ + --raw_file $mzML_file \\ + --trim $params.trim \\ + --resolution $params.resolution """ } diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 5366264..fda09e7 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,4 +1,5 @@ # load required packages +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,17 +7,23 @@ suppressMessages(library("tidyr")) suppressMessages(library("dplyr")) suppressMessages(library("stringr")) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateExcel") -project <- cmd_args[1] -hmdb_rlvc_file <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) -export_scripts_dir <- cmd_args[4] -path_metabolite_groups <- cmd_args[5] +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--hmdb_rlvc_file", dest = "hmdb_rlvc_file", + help = "File path to the HMDB relevance file", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) +parser$add_argument("--path_metabolite_groups", dest = "path_metabolite_groups", + help = "File path to the directory containing the metabolite groups", required = TRUE) + +args <- parser$parse_args() # load in function scripts -source(paste0(export_scripts_dir, "generate_excel_functions.R")) +source(paste0(args$export_scripts_dir, "generate_excel_functions.R")) # set the number of digits for floats options(digits = 16) @@ -35,7 +42,7 @@ perc <- 5 outlier_threshold <- 2 # load HMDB rlvnc table -load(hmdb_rlvc_file) +load(args$hmdb_rlvc_file) # load outlist object load("AdductSums_combined.RData") @@ -64,7 +71,7 @@ wb_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_intensities, sheetname) # Add Z-scores and create plots -if (z_score == 1) { +if (args$z_score == 1) { dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) wb_helix_intensities <- openxlsx::createWorkbook("SinglePatient") openxlsx::addWorksheet(wb_helix_intensities, sheetname) @@ -121,13 +128,13 @@ if (z_score == 1) { # get Helix IDs for extra Excel file metabolite_files <- list.files( - path = paste(path_metabolite_groups, "Diagnostics", sep = "/"), + path = paste(args$path_metabolite_groups, "Diagnostics", sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE ) metab_df_helix <- NULL for (file_index in seq_along(metabolite_files)) { infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, "Diagnostics", infile, sep = "/"), + metab_list <- read.table(paste(args$path_metabolite_groups, "Diagnostics", infile, sep = "/"), sep = "\t", header = TRUE, quote = "" ) metab_df_helix <- rbind(metab_df_helix, metab_list) @@ -243,7 +250,7 @@ if (z_score == 1) { plots_present = TRUE ) openxlsx::writeData(wb_helix_intensities, sheet = 1, outlist_helix, startCol = 1) - openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", project, ".xlsx"), overwrite = TRUE) + openxlsx::saveWorkbook(wb_helix_intensities, paste0(outdir, "/Helix_", args$project, ".xlsx"), overwrite = TRUE) rm(wb_helix_intensities) # reorder outlist for Excel file @@ -267,6 +274,6 @@ if (z_score == 1) { # write Excel file openxlsx::writeData(wb_intensities, sheet = 1, outlist, startCol = 1) -openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", project, ".xlsx"), overwrite = TRUE) +openxlsx::saveWorkbook(wb_intensities, paste0(outdir, "/", args$project, ".xlsx"), overwrite = TRUE) rm(wb_intensities) unlink("plots", recursive = TRUE) diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 552a8ee..21fefd4 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -1,7 +1,7 @@ process GenerateExcel { tag "DIMS GenerateExcel" label 'GenerateExcel' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -18,6 +18,11 @@ process GenerateExcel { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $analysis_id $relevance_file $params.zscore $params.export_scripts_dir $params.path_metabolite_groups + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R \\ + --project $analysis_id \\ + --hmdb_rlvc_file $relevance_file \\ + --z_score $params.zscore \\ + --export_scripts_dir $params.export_scripts_dir \\ + --path_metabolite_groups $params.path_metabolite_groups """ } diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 4e3394d..2bc87a5 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -1,3 +1,4 @@ +library("argparse") library("ggplot2") library("reshape2") library("openxlsx") @@ -6,23 +7,30 @@ suppressMessages(library("dplyr")) # set the number of digits for floats options(digits = 16) -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "GenerateQCOutput") -init_file <- cmd_args[1] -project <- cmd_args[2] -dims_matrix <- cmd_args[3] -z_score <- cmd_args[4] -sst_components_file <- cmd_args[5] -export_scripts_dir <- cmd_args[6] +parser$add_argument("--init_file_path", dest = "init_file", + help = "File path for the init RData file", required = TRUE) +parser$add_argument("--project", dest = "project", + help = "Project name", required = TRUE) +parser$add_argument("--matrix", dest = "dims_matrix", + help = "The matrix used, e.g. Plasma, Research, ...", required = TRUE) +parser$add_argument("-z", "--z_score", dest = "z_score", type = "integer", + help = "Numeric value, z-score = 1", required = TRUE) +parser$add_argument("--sst_components_file", dest = "sst_components_file", + help = "File path to the SST components file", required = TRUE) +parser$add_argument("--export_scripts_dir", dest = "export_scripts_dir", + help = "File path to the directory containing functions used in this script", required = TRUE) + +args <- parser$parse_args() outdir <- "./" # load in function scripts -source(paste0(export_scripts_dir, "generate_qc_output_functions.R")) +source(paste0(args$export_scripts_dir, "generate_qc_output_functions.R")) # load init files -load(init_file) +load(args$init_file) # load outlist from GenerateExcel load("outlist.RData") # load combined adducts for each scanmodus @@ -38,7 +46,7 @@ dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) control_label <- "C" #### CHECK NUMBER OF CONTROLS #### -if (z_score == 1) { +if (args$z_score == 1) { file_name <- "Check_number_of_controls.txt" min_num_controls <- 25 check_number_of_controls(outlist, min_num_controls, file_name) @@ -67,14 +75,14 @@ outlist_tot_neg <- outlist_tot_neg[, samples_both_modes] outlist_tot_pos <- outlist_tot_pos[, samples_both_modes] # Retrieve IS summed adducts -is_summed <- get_internal_standards(is_list, "summed", repl_pattern, dims_matrix, rundate, project) +is_summed <- get_internal_standards(is_list, "summed", repl_pattern, args$dims_matrix, rundate, args$project) # Retrieve IS positive mode -is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, dims_matrix, rundate, project) +is_pos <- get_internal_standards(is_list, "pos", outlist_tot_pos, args$dims_matrix, rundate, args$project) # Retrieve IS negative mode -is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, dims_matrix, rundate, project) +is_neg <- get_internal_standards(is_list, "neg", outlist_tot_neg, args$dims_matrix, rundate, args$project) # Save results -save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) +save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", args$project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) @@ -138,7 +146,7 @@ is_sum_selection <- c( ) # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) -if (dims_matrix == "DBS") { +if (args$dims_matrix == "DBS") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -155,7 +163,7 @@ if (dims_matrix == "DBS") { int_line = c(1300000, 2500000, 500000, 1800000, 1400000), HMDB_name = is_sum_selection ) -} else if (dims_matrix == "Plasma") { +} else if (args$dims_matrix == "Plasma") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( @@ -239,7 +247,7 @@ patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\ positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) positive_control_list <- column_list[positive_controls_index] -if (z_score == 1) { +if (args$z_score == 1) { # find if one or more positive control samples are missing pos_contr_warning <- c() if (all(sapply(c("^P1002", "^P1003", "^P1005"), @@ -280,16 +288,16 @@ if (z_score == 1) { positive_control$Zscore <- as.numeric(positive_control$Zscore) # extra information added to excel for future reference. made in beginning of this script - positive_control$Matrix <- dims_matrix + positive_control$Matrix <- args$dims_matrix positive_control$Rundate <- rundate - positive_control$Project <- project + positive_control$Project <- args$project # Save results - save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData")) + save(positive_control, file = paste0(outdir, "/", args$project, "_positive_control.RData")) # round the Z-scores to 2 digits positive_control$Zscore <- round_df(positive_control$Zscore, 2) write.xlsx(positive_control, - file = paste0(outdir, "/", project, "_positive_control.xlsx"), + file = paste0(outdir, "/", args$project, "_positive_control.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE ) } @@ -315,7 +323,7 @@ is_neg_intensities <- get_is_intensities(outlist_tot_neg, is_codes = is_codes) is_pos_intensities <- get_is_intensities(outlist_tot_pos, is_codes = is_codes) # SST components. -sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") +sst_comp <- read.csv(args$sst_components_file, header = TRUE, sep = "\t") sst_list <- outlist %>% filter(HMDB_code %in% sst_comp$HMDB_ID) sst_colnrs <- grep("P1001", colnames(sst_list)) @@ -352,7 +360,7 @@ setColWidths(wb, 3, cols = 1, widths = 24) addWorksheet(wb, "SST components") openxlsx::writeData(wb, sheet = 4, sst_list_intensities) setColWidths(wb, 4, cols = 1:3, widths = 24) -xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") +xlsx_name <- paste0(outdir, "/", args$project, "_IS_SST.xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index eb17301..6e4c994 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -1,7 +1,7 @@ process GenerateQCOutput { tag "DIMS GenerateQCOutput" label 'GenerateQCOutput' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -20,11 +20,12 @@ process GenerateQCOutput { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R $init_file \ - $analysis_id \ - $params.matrix \ - $params.zscore \ - $params.sst_components_file \ - $params.export_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/GenerateQCOutput.R \\ + --init_file_path $init_file \\ + --project $analysis_id \\ + --matrix $params.matrix \\ + --z_score $params.zscore \\ + --sst_components_file $params.sst_components_file \\ + --export_scripts_dir $params.export_scripts_dir """ } diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 1c4b532..6136ec3 100755 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -1,7 +1,7 @@ process GenerateViolinPlots { tag "DIMS GenerateViolinPlots" label 'GenerateViolinPlots' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 5f19f75..4b1fa5a 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,7 +1,7 @@ process HMDBparts { tag "DIMS HMDBparts" label 'HMDBparts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 14335bf..c1bb218 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,34 +1,38 @@ -## adapted from hmdb_part_adductSums.R +# load package +load("argparse") -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "HMDBparts_main") -db_file <- cmd_args[1] -breaks_file <- cmd_args[2] +parser$add_argument("--hmdb_db_file", dest = "hmdb_db_file", + help = "File path the HMDB databse file", required = TRUE) +parser$add_argument("--breaks_file", dest = "breaks_file", + help = "File containing all the breaks/bins", required = TRUE) -load(db_file) -load(breaks_file) +args <- parser$parse_args() -# Cut up HMDB minus adducts minus isotopes into small parts +load(args$db_file) +load(args$breaks_file) + +# Cut up HMDB minus adducts minus isotopes into small parts scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" - HMDB_add_iso <- HMDB_add_iso.Neg + hmdb_add_iso <- HMDB_add_iso.Neg } else if (scanmode == "positive") { column_label <- "Mpos" - HMDB_add_iso <- HMDB_add_iso.Pos + hmdb_add_iso <- HMDB_add_iso.Pos } # filter mass range measured - outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & - HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + outlist <- hmdb_add_iso[which(hmdb_add_iso[, column_label] >= breaks_fwhm[1] & + hmdb_add_iso[, column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] # sort on m/z value - outlist <- outlist[order(outlist[ , column_label]), ] + outlist <- outlist[order(outlist[, column_label]), ] nr_rows <- dim(outlist)[1] # size of hmdb parts in lines: @@ -37,12 +41,12 @@ for (scanmode in scanmodes) { check <- 0 # generate hmdb parts - if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { + if (nr_rows >= sub && (floor(nr_rows / sub)) >= 2) { for (i in 1:floor(nr_rows / sub)) { start <- -(sub - 1) + i * sub end <- i * sub outlist_part <- outlist[c(start:end), ] - save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) + save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i, ".RData")) } } @@ -53,4 +57,4 @@ for (scanmode in scanmodes) { outlist_part <- outlist[c(start:end), ] save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) -} \ No newline at end of file +} diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index b38bac0..2ae3457 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -1,7 +1,7 @@ process HMDBparts_main { tag "DIMS HMDBparts_main" label 'HMDBparts_main' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -14,6 +14,8 @@ process HMDBparts_main { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R \\ + --hmdb_db_file $hmdb_db_file \\ + --breaks_file $breaks_file """ } diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d4996..d5f0570 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,14 +1,18 @@ -## adapted from makeInit in old pipeline +# load required packages +library("argparse") -# define parameters -args <- commandArgs(trailingOnly = TRUE) +parser <- ArgumentParser(description = "MakeInit") -sample_sheet <- read.csv(args[1], sep = "\t") -nr_replicates <- as.numeric(args[2]) +parser$add_argument("--sample_sheet", dest = "sample_sheet", + help = "Samplesheet txt file", required = TRUE) +parser$add_argument("--nr_replicates", dest = "nr_replicates", type = "integer", + help = "Number of replicates, numeric value", required = TRUE) -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +args <- parser$parse_args() + +sample_names <- trimws(as.vector(unlist(args$sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / args$nr_replicates +group_names <- trimws(as.vector(unlist(args$sample_sheet[2]))) group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) @@ -16,8 +20,8 @@ group_names_unique <- unique(group_names) repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { tmp <- c() - for (repl in nr_replicates:1) { - index <- ((sample_group * nr_replicates) - repl) + 1 + for (repl in args$nr_replicates:1) { + index <- ((sample_group * args$nr_replicates) - repl) + 1 tmp <- c(tmp, sample_names[index]) } repl_pattern <- c(repl_pattern, list(tmp)) diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 7aae0e4..793d8f0 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -1,7 +1,7 @@ process MakeInit { tag "DIMS MakeInit" label 'MakeInit' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -13,6 +13,8 @@ process MakeInit { script: """ - Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R \\ + --samplesheet $samplesheet \\ + --nr_replicates $nr_replicates """ } diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0097d12..0e25893 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -1,7 +1,7 @@ process PeakFinding { tag "DIMS PeakFinding ${rdata_file}" label 'PeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 0ecb18c..7f3f17e 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -1,7 +1,7 @@ process PeakGrouping { tag "DIMS PeakGrouping ${hmdbpart_file}" label 'PeakGrouping' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 3590265..a8ae5a8 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -1,7 +1,7 @@ process SpectrumPeakFinding { tag "DIMS SpectrumPeakFinding" label 'SpectrumPeakFinding' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 4b3f965..a667e6e 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -1,7 +1,7 @@ process SumAdducts { tag "DIMS SumAdducts ${hmdbpart_main_file}" label 'SumAdducts' - container = 'docker://umcugenbioinf/dims:1.3' + container = 'docker://umcugenbioinf/dims:1.4' shell = ['/bin/bash', '-euo', 'pipefail'] input: