Skip to content
21 changes: 9 additions & 12 deletions DIMS/CollectFilled.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
## adapted from 10-collectSamplesFilled.R

# define parameters
cmd_args <- commandArgs(trailingOnly = TRUE)

scripts_dir <- cmd_args[1]
preprocessing_scripts_dir <- cmd_args[1]
ppm <- as.numeric(cmd_args[2])
z_score <- as.numeric(cmd_args[3])

source(paste0(scripts_dir, "merge_duplicate_rows.R"))
source(paste0(scripts_dir, "calculate_zscores.R"))
source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R"))

# for each scan mode, collect all filled peak group lists
scanmodes <- c("positive", "negative")
Expand All @@ -17,7 +14,7 @@ for (scanmode in scanmodes) {
filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
# load files and combine into one object
outlist_total <- NULL
for (file_nr in 1:length(filled_files)) {
for (file_nr in seq_along(filled_files)) {
peakgrouplist_filled <- get(load(filled_files[file_nr]))
outlist_total <- rbind(outlist_total, peakgrouplist_filled)
}
Expand Down Expand Up @@ -47,14 +44,14 @@ for (scanmode in scanmodes) {
outlist_stats_more <- cbind(outlist_stats_more, tmp)
outlist_total <- outlist_stats_more
}

# make a copy of the outlist
outlist_ident <- outlist_total
# take care of NAs in theormz_noise
outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
# select identified peak groups if ppm deviation is within limits
if (z_score == 1) {
outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
}

# Extra output in Excel-readable format:
remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
Expand Down
2 changes: 1 addition & 1 deletion DIMS/CollectFilled.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ process CollectFilled {

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore
Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.preprocessing_scripts_dir $params.ppm $params.zscore
"""
}
26 changes: 7 additions & 19 deletions DIMS/FillMissing.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
## adapted from 9-runFillMissing.R

# define parameters
cmd_args <- commandArgs(trailingOnly = TRUE)

peakgrouplist_file <- cmd_args[1]
scripts_dir <- cmd_args[2]
preprocessing_scripts_dir <- cmd_args[2]
thresh <- as.numeric(cmd_args[3])
resol <- as.numeric(cmd_args[4])
ppm <- as.numeric(cmd_args[5])
outdir <- "./"

# load in function scripts
source(paste0(scripts_dir, "replace_zeros.R"))
source(paste0(scripts_dir, "fit_optim.R"))
source(paste0(scripts_dir, "get_fwhm.R"))
source(paste0(scripts_dir, "get_stdev.R"))
source(paste0(scripts_dir, "estimate_area.R"))
source(paste0(scripts_dir, "optimize_gaussfit.R"))
source(paste0(scripts_dir, "identify_noisepeaks.R"))
source(paste0(scripts_dir, "get_element_info.R"))
source(paste0(scripts_dir, "atomic_info.R"))
source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R"))

# determine scan mode
if (grepl("_pos", peakgrouplist_file)) {
Expand All @@ -33,12 +20,13 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData")
repl_pattern <- get(load(pattern_file))

# load peak group list and determine output file name
outpgrlist_identified <- get(load(peakgrouplist_file))

outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
peakgroup_list <- get(load(peakgrouplist_file))

# replace missing values (zeros) with random noise
peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh)

# set name of output file
outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)

# save output
save(peakgrouplist_filled, file = outputfile_name)
2 changes: 1 addition & 1 deletion DIMS/FillMissing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ process FillMissing {

script:
"""
Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.preprocessing_scripts_dir $params.thresh
"""
}
1 change: 1 addition & 0 deletions DIMS/GenerateQCOutput.R
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ if (length(sst_colnrs) > 0) {
sst_list_intensities <- sst_list[, intensity_col_ids]
}
for (col_nr in seq_len(ncol(sst_list_intensities))) {
sst_list_intensities <- as.data.frame(sst_list_intensities)
sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr])
if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) {
sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2)
Expand Down
2 changes: 1 addition & 1 deletion DIMS/Utils/calculate_zscores.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ calculate_zscores <- function(peakgroup_list, adducts) {
ppmdev[i] <- NA
}
}
peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)])
peakgroup_list <- cbind(peakgroup_list[, 1:4], ppmdev = ppmdev, peakgroup_list[, 5:ncol(peakgroup_list)])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the indices be changes to column names, for readability and if column orders change in the future?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ppmdev column is already present in the peak grouplist, so this section is refactored. Referring to columns by number has been removed.

}
}

Expand Down
4 changes: 0 additions & 4 deletions DIMS/Utils/merge_duplicate_rows.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,6 @@ merge_duplicate_rows <- function(peakgroup_list) {
single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index)
if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA
single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index)
if (single_peakgroup[, "theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA
single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA
Expand Down
119 changes: 119 additions & 0 deletions DIMS/preprocessing/collect_filled_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# CollectFilled functions

collapse <- function(column_label, peakgroup_list, index_dup) {
#' Collapse identification info for peak groups with the same mass
#'
#' @param column_label: Name of column in peakgroup_list (string)
#' @param peakgroup_list: Peak group list (matrix)
#' @param index_dup: Index of duplicate peak group (integer)
#'
#' @return collapsed_items: Semicolon-separated list of info (string)
# get the item(s) that need to be collapsed
list_items <- as.vector(peakgroup_list[index_dup, column_label])
# remove NA
if (length(which(is.na(list_items))) > 0) {
list_items <- list_items[-which(is.na(list_items))]
}
collapsed_items <- paste(list_items, collapse = ";")
return(collapsed_items)
}

merge_duplicate_rows <- function(peakgroup_list) {
#' Merge identification info for peak groups with the same mass
#'
#' @param peakgroup_list: Peak group list (matrix)
#'
#' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)

options(digits = 16)
collect <- NULL
remove <- NULL

# check for peak groups with identical mass
index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))

while (length(index_dup) > 0) {
# get the index for the peak group which is double
peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]

# use function collapse to concatenate info
single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA

# keep track of deduplicated entries
collect <- rbind(collect, single_peakgroup)
remove <- c(remove, peaklist_index)

# remove current entry from index
index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
}

# remove duplicate entries
if (!is.null(remove)) {
peakgroup_list <- peakgroup_list[-remove, ]
}
# append deduplicated entries
peakgroup_list_dedup <- rbind(peakgroup_list, collect)
return(peakgroup_list_dedup)
}

calculate_zscores <- function(peakgroup_list) {
#' Calculate Z-scores for peak groups based on average and standard deviation of controls
#'
#' @param peakgroup_list: Peak group list (matrix)
#' @param sort_col: Column to sort on (string)
#' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
#'
#' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)

case_label <- "P"
control_label <- "C"
# get index for new column names
startcol <- ncol(peakgroup_list) + 3

# calculate mean and standard deviation for Control group
ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
int_cols <- c(ctrl_cols, case_cols)
# set all zeros to NA
peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))

# set new column names and calculate Z-scores
colnames_zscores <- NULL
for (col_index in int_cols) {
col_name <- colnames(peakgroup_list)[col_index]
colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
peakgroup_list <- cbind(peakgroup_list, zscores_1col)
}

# apply new column names to columns at end plus avg and sd columns
colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores

# add ppm deviation column
zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
# calculate ppm deviation
for (row_index in seq_len(nrow(peakgroup_list))) {
if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
!is.null(peakgroup_list$theormz_HMDB[row_index]) &&
(peakgroup_list$theormz_HMDB[row_index] != "")) {
peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
} else {
peakgroup_list$ppmdev[row_index] <- NA
}
}

return(peakgroup_list)
}

36 changes: 36 additions & 0 deletions DIMS/preprocessing/fill_missing_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) {
#' Replace intensities that are zero with random value
#'
#' @param peakgroup_list: Peak groups (matrix)
#' @param repl_pattern: Replication pattern (list of strings)
#' @param thresh: Value for threshold between noise and signal (integer)
#'
#' @return final_outlist: peak groups with filled-in intensities (matrix)

# for unit test, turn off randomness
if (not_random) {
set.seed(123)
}

# replace missing intensities with random values around threshold
if (!is.null(peakgroup_list)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe move if not null statement to main script. Also missing else, what happens if it is null?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep the main script as 'clean' as possible, so I prefer to keep the if statement inside the function.
If peakgroup_list is null, nothing happens; a null object is saved and passed to the next step.

for (sample_index in seq_along(names(repl_pattern))) {
sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]]
zero_intensity <- which(sample_peaks <= 0)
if (!length(zero_intensity)) {
next
}
for (zero_index in seq_along(zero_intensity)) {
peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
mean = thresh,
sd = 100)
}
}

# Add column with average intensity; find intensity columns first
int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern))
peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean))

return(peakgroup_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return is within the if-statement, what happens if peakgroup_list is null?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above; a null object is returned.

}
}
45 changes: 45 additions & 0 deletions DIMS/tests/testthat/test_fill_missing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# unit tests for FillMissing
# function: fill_missing_intensities
source("../../preprocessing/fill_missing_functions.R")

# test fill_missing_intensities
testthat::test_that("missing values are corretly filled with random values", {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add function name that is tested

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function names have not been added in the test_that line for any other unit test; this is a good idea for the general standardization for version 3.5.

# create peakgroup_list to test on in diagnostics setting
test_peakgroup_list <- data.frame(matrix(NA, nrow = 4, ncol = 23))
colnames(test_peakgroup_list) <- c("mzmed.pgrp", "nrsamples", "ppmdev", "assi_HMDB", "all_hmdb_names",
"iso_HMDB", "HMDB_code", "all_hmdb_ids", "sec_hmdb_ids", "theormz_HMDB",
"C101.1", "C102.1", "P2.1", "P3.1",
"avg.int", "assi_noise", "theormz_noise", "avg.ctrls", "sd.ctrls",
"C101.1_Zscore", "C102.1_Zscore", "P2.1_Zscore", "P3.1_Zscore")
test_peakgroup_list[, c(1)] <- 300 + runif(4)
test_peakgroup_list[, c(2, 3)] <- runif(8)
test_peakgroup_list[, "HMDB_code"] <- c("HMDB1234567", "HMDB1234567_1", "HMDB1234567_2", "HMDB1234567_7")
test_peakgroup_list[, "all_hmdb_ids"] <- paste(test_peakgroup_list[, "HMDB_code"],
test_peakgroup_list[, "HMDB_code"], sep = ";")
test_peakgroup_list[, "all_hmdb_names"] <- paste(test_peakgroup_list[, "assi_HMDB"],
test_peakgroup_list[, "assi_HMDB"], sep = ";")
test_peakgroup_list[, grep("C", colnames(test_peakgroup_list))] <- 1000 * (1:16)
Comment on lines +8 to +21
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same code as test_sum_intensities_adducts.R. Maybe make a txt file with the test_peakgroup_list table and load the table in the test_that() function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. test_sum_adducts.R will be modified in version 3.5.

test_peakgroup_list[, grep("P", colnames(test_peakgroup_list))] <- 0
test_repl_pattern <- c(list(1), list(2), list(3), list(4))
names(test_repl_pattern) <- c("C101.1", "C102.1", "P2.1", "P3.1")
test_thresh <- 2000

# create a large peak group list to test for negative values
test_large_peakgroup_list <- rbind(test_peakgroup_list, test_peakgroup_list)
for (index in 1:15) {
test_large_peakgroup_list <- rbind(test_large_peakgroup_list, test_large_peakgroup_list)
}
# for the sake of time, leave only one intensity column with zeros
test_large_peakgroup_list$P2.1 <- 1

expect_equal(round(fill_missing_intensities(test_peakgroup_list, test_repl_pattern, test_thresh, not_random = TRUE)$P2.1),
c(1944, 1977, 2156, 2007), TRUE, tolerance = 0.1)
# fill_missing_intensities should not produce any negative values, even if a large quantity of numbers are filled in
start.time <- Sys.time()
expect_gt(min(fill_missing_intensities(test_large_peakgroup_list, test_repl_pattern, test_thresh, not_random = FALSE)$P3.1),
0, TRUE)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken

})
Loading