UMCUGenetics · mraves2 · Oct 13, 2025 · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
@@ -1,14 +1,11 @@
-## adapted from 10-collectSamplesFilled.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-scripts_dir <- cmd_args[1]
+preprocessing_scripts_dir <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "merge_duplicate_rows.R"))
-source(paste0(scripts_dir, "calculate_zscores.R"))
+source(paste0(preprocessing_scripts_dir, "collect_filled_functions.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
@@ -17,7 +14,7 @@ for (scanmode in scanmodes) {
   filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
   # load files and combine into one object
   outlist_total <- NULL
-  for (file_nr in 1:length(filled_files)) {
+  for (file_nr in seq_along(filled_files)) {
     peakgrouplist_filled <- get(load(filled_files[file_nr]))
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
@@ -47,14 +44,14 @@ for (scanmode in scanmodes) {
     outlist_stats_more <- cbind(outlist_stats_more, tmp)
     outlist_total <- outlist_stats_more
   }
-  
+
   # make a copy of the outlist
   outlist_ident <- outlist_total
-  # take care of NAs in theormz_noise
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
-  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
-  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
+  # select identified peak groups if ppm deviation is within limits
+  if (z_score == 1) {
+    outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
+    outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
+  }
 
   # Extra output in Excel-readable format:
   remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")

diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
@@ -14,6 +14,6 @@ process CollectFilled {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore
+        Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.preprocessing_scripts_dir $params.ppm $params.zscore
         """
 }
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
@@ -1,25 +1,12 @@
-## adapted from 9-runFillMissing.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 peakgrouplist_file <- cmd_args[1]
-scripts_dir <- cmd_args[2]
+preprocessing_scripts_dir <- cmd_args[2]
 thresh <- as.numeric(cmd_args[3])
-resol <- as.numeric(cmd_args[4])
-ppm <- as.numeric(cmd_args[5])
-outdir <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "replace_zeros.R"))
-source(paste0(scripts_dir, "fit_optim.R"))
-source(paste0(scripts_dir, "get_fwhm.R"))
-source(paste0(scripts_dir, "get_stdev.R"))
-source(paste0(scripts_dir, "estimate_area.R"))
-source(paste0(scripts_dir, "optimize_gaussfit.R"))
-source(paste0(scripts_dir, "identify_noisepeaks.R"))
-source(paste0(scripts_dir, "get_element_info.R"))
-source(paste0(scripts_dir, "atomic_info.R"))
+source(paste0(preprocessing_scripts_dir, "fill_missing_functions.R"))
 
 # determine scan mode
 if (grepl("_pos", peakgrouplist_file)) {
@@ -33,12 +20,13 @@ pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 repl_pattern <- get(load(pattern_file))
 
 # load peak group list and determine output file name
-outpgrlist_identified <- get(load(peakgrouplist_file))
-
-outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
+peakgroup_list <- get(load(peakgrouplist_file))
 
 # replace missing values (zeros) with random noise
-peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+peakgrouplist_filled <- fill_missing_intensities(peakgroup_list, repl_pattern, thresh)
+
+# set name of output file
+outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # save output
 save(peakgrouplist_filled, file = outputfile_name)
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
@@ -13,6 +13,6 @@ process FillMissing {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.preprocessing_scripts_dir $params.thresh
         """
 }
diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R
@@ -329,6 +329,7 @@ if (length(sst_colnrs) > 0) {
   sst_list_intensities <- sst_list[, intensity_col_ids]
 }
 for (col_nr in seq_len(ncol(sst_list_intensities))) {
+  sst_list_intensities <- as.data.frame(sst_list_intensities)
   sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr])
   if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) {
     sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2)

diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R
@@ -55,7 +55,7 @@ calculate_zscores <- function(peakgroup_list, adducts) {
           ppmdev[i] <- NA
         }
       }
-      peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)])
+      peakgroup_list <- cbind(peakgroup_list[, 1:4], ppmdev = ppmdev, peakgroup_list[, 5:ncol(peakgroup_list)])
     }
   }
 

diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R
@@ -38,10 +38,6 @@ merge_duplicate_rows <- function(peakgroup_list) {
     single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
     single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
     single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
-    single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index)
-    if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA
-    single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index)
-    if (single_peakgroup[, "theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA
     single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
     single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
     if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA

diff --git a/DIMS/preprocessing/collect_filled_functions.R b/DIMS/preprocessing/collect_filled_functions.R
@@ -0,0 +1,119 @@
+# CollectFilled functions
+
+collapse <- function(column_label, peakgroup_list, index_dup) {
+  #' Collapse identification info for peak groups with the same mass
+  #'
+  #' @param column_label: Name of column in peakgroup_list (string)
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param index_dup: Index of duplicate peak group (integer)
+  #'
+  #' @return collapsed_items: Semicolon-separated list of info (string)
+  # get the item(s) that need to be collapsed
+  list_items <- as.vector(peakgroup_list[index_dup, column_label])
+  # remove NA
+  if (length(which(is.na(list_items))) > 0) {
+    list_items <- list_items[-which(is.na(list_items))]
+  }
+  collapsed_items <- paste(list_items, collapse = ";")
+  return(collapsed_items)
+}
+
+merge_duplicate_rows <- function(peakgroup_list) {
+  #' Merge identification info for peak groups with the same mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  options(digits = 16)
+  collect <- NULL
+  remove <- NULL
+
+  # check for peak groups with identical mass
+  index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))
+
+  while (length(index_dup) > 0) {
+    # get the index for the peak group which is double
+    peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
+    single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]
+
+    # use function collapse to concatenate info
+    single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
+    single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index)
+    single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index)
+    if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA
+
+    # keep track of deduplicated entries
+    collect <- rbind(collect, single_peakgroup)
+    remove <- c(remove, peaklist_index)
+
+    # remove current entry from index
+    index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
+  }
+
+  # remove duplicate entries
+  if (!is.null(remove)) {
+    peakgroup_list <- peakgroup_list[-remove, ]
+  }
+  # append deduplicated entries
+  peakgroup_list_dedup <- rbind(peakgroup_list, collect)
+  return(peakgroup_list_dedup)
+}
+
+calculate_zscores <- function(peakgroup_list) {
+  #' Calculate Z-scores for peak groups based on average and standard deviation of controls
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param sort_col: Column to sort on (string)
+  #' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  case_label <- "P"
+  control_label <- "C"
+  # get index for new column names
+  startcol <- ncol(peakgroup_list) + 3
+
+  # calculate mean and standard deviation for Control group
+  ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
+  case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
+  int_cols <- c(ctrl_cols, case_cols)
+  # set all zeros to NA
+  peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
+  ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
+  peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
+  peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+
+  # set new column names and calculate Z-scores
+  colnames_zscores <- NULL
+  for (col_index in int_cols) {
+    col_name <- colnames(peakgroup_list)[col_index]
+    colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
+    zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
+                     peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
+    peakgroup_list <- cbind(peakgroup_list, zscores_1col)
+  }
+
+  # apply new column names to columns at end plus avg and sd columns
+  colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
+
+  # add ppm deviation column
+  zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
+  # calculate ppm deviation
+  for (row_index in seq_len(nrow(peakgroup_list))) {
+    if (!is.na(peakgroup_list$theormz_HMDB[row_index]) &&
+        !is.null(peakgroup_list$theormz_HMDB[row_index]) &&
+        (peakgroup_list$theormz_HMDB[row_index] != "")) {
+      peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) -
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) /
+                                                  as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))
+    } else {
+      peakgroup_list$ppmdev[row_index] <- NA
+    }
+  }
+
+  return(peakgroup_list)
+}
+
diff --git a/DIMS/preprocessing/fill_missing_functions.R b/DIMS/preprocessing/fill_missing_functions.R
@@ -0,0 +1,36 @@
+fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) {
+  #' Replace intensities that are zero with random value
+  #'
+  #' @param peakgroup_list: Peak groups (matrix)
+  #' @param repl_pattern: Replication pattern (list of strings)
+  #' @param thresh: Value for threshold between noise and signal (integer)
+  #'
+  #' @return final_outlist: peak groups with filled-in intensities (matrix)
+
+  # for unit test, turn off randomness
+  if (not_random) {
+    set.seed(123)
+  }
+
+  # replace missing intensities with random values around threshold
+  if (!is.null(peakgroup_list)) {
+    for (sample_index in seq_along(names(repl_pattern))) {
+      sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]]
+      zero_intensity <- which(sample_peaks <= 0)
+      if (!length(zero_intensity)) {
+        next
+      }
+      for (zero_index in seq_along(zero_intensity)) {
+        peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1,
+                                                                                               mean = thresh,
+                                                                                               sd = 100)
+      }
+    }
+
+    # Add column with average intensity; find intensity columns first
+    int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern))
+    peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean))
+
+    return(peakgroup_list)
+  }
+}
diff --git a/DIMS/tests/testthat/test_fill_missing.R b/DIMS/tests/testthat/test_fill_missing.R
@@ -0,0 +1,45 @@
+# unit tests for FillMissing
+# function: fill_missing_intensities
+source("../../preprocessing/fill_missing_functions.R")
+
+# test fill_missing_intensities
+testthat::test_that("missing values are corretly filled with random values", {
+  # create peakgroup_list to test on in diagnostics setting
+  test_peakgroup_list <- data.frame(matrix(NA, nrow = 4, ncol = 23))
+  colnames(test_peakgroup_list) <- c("mzmed.pgrp", "nrsamples", "ppmdev", "assi_HMDB", "all_hmdb_names",
+                                     "iso_HMDB", "HMDB_code", "all_hmdb_ids", "sec_hmdb_ids", "theormz_HMDB",
+                                     "C101.1", "C102.1", "P2.1", "P3.1",
+                                     "avg.int", "assi_noise", "theormz_noise", "avg.ctrls", "sd.ctrls",
+                                     "C101.1_Zscore", "C102.1_Zscore", "P2.1_Zscore", "P3.1_Zscore")
+  test_peakgroup_list[, c(1)] <- 300 + runif(4)
+  test_peakgroup_list[, c(2, 3)] <- runif(8)
+  test_peakgroup_list[, "HMDB_code"] <- c("HMDB1234567", "HMDB1234567_1", "HMDB1234567_2", "HMDB1234567_7")
+  test_peakgroup_list[, "all_hmdb_ids"] <- paste(test_peakgroup_list[, "HMDB_code"],
+                                                 test_peakgroup_list[, "HMDB_code"], sep = ";")
+  test_peakgroup_list[, "all_hmdb_names"] <- paste(test_peakgroup_list[, "assi_HMDB"],
+                                                   test_peakgroup_list[, "assi_HMDB"], sep = ";")
+  test_peakgroup_list[, grep("C", colnames(test_peakgroup_list))] <- 1000 * (1:16)
+  test_peakgroup_list[, grep("P", colnames(test_peakgroup_list))] <- 0
+  test_repl_pattern <- c(list(1), list(2), list(3), list(4))
+  names(test_repl_pattern) <- c("C101.1", "C102.1", "P2.1", "P3.1")
+  test_thresh <- 2000
+
+  # create a large peak group list to test for negative values
+  test_large_peakgroup_list <- rbind(test_peakgroup_list, test_peakgroup_list)
+  for (index in 1:15) {
+    test_large_peakgroup_list <- rbind(test_large_peakgroup_list, test_large_peakgroup_list)
+  }
+  # for the sake of time, leave only one intensity column with zeros
+  test_large_peakgroup_list$P2.1 <- 1
+
+  expect_equal(round(fill_missing_intensities(test_peakgroup_list, test_repl_pattern, test_thresh, not_random = TRUE)$P2.1),
+               c(1944, 1977, 2156, 2007), TRUE, tolerance = 0.1)
+  # fill_missing_intensities should not produce any negative values, even if a large quantity of numbers are filled in
+  start.time <- Sys.time()
+  expect_gt(min(fill_missing_intensities(test_large_peakgroup_list, test_repl_pattern, test_thresh, not_random = FALSE)$P3.1),
+            0, TRUE)
+  end.time <- Sys.time()
+  time.taken <- end.time - start.time
+  time.taken
+
+})
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,6 +14,6 @@ process CollectFilled { @@
         script:
             """
-            Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore
+            Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.preprocessing_scripts_dir $params.ppm $params.zscore
             """
     }