-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/refactor DIMS FillMissing #91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
4c937de
4b395e1
1f77b21
d0aff6c
519c5b7
4bbccd8
7ea64f0
14ce7e6
c386ea3
1ff7e0e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| # CollectFilled functions | ||
|
|
||
| collapse <- function(column_label, peakgroup_list, index_dup) { | ||
| #' Collapse identification info for peak groups with the same mass | ||
| #' | ||
| #' @param column_label: Name of column in peakgroup_list (string) | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' @param index_dup: Index of duplicate peak group (integer) | ||
| #' | ||
| #' @return collapsed_items: Semicolon-separated list of info (string) | ||
| # get the item(s) that need to be collapsed | ||
| list_items <- as.vector(peakgroup_list[index_dup, column_label]) | ||
| # remove NA | ||
| if (length(which(is.na(list_items))) > 0) { | ||
| list_items <- list_items[-which(is.na(list_items))] | ||
| } | ||
| collapsed_items <- paste(list_items, collapse = ";") | ||
| return(collapsed_items) | ||
| } | ||
|
|
||
| merge_duplicate_rows <- function(peakgroup_list) { | ||
| #' Merge identification info for peak groups with the same mass | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' | ||
| #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) | ||
|
|
||
| options(digits = 16) | ||
| collect <- NULL | ||
| remove <- NULL | ||
|
|
||
| # check for peak groups with identical mass | ||
| index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) | ||
|
|
||
| while (length(index_dup) > 0) { | ||
| # get the index for the peak group which is double | ||
| peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) | ||
| single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] | ||
|
|
||
| # use function collapse to concatenate info | ||
| single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) | ||
| single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) | ||
| if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA | ||
|
|
||
| # keep track of deduplicated entries | ||
| collect <- rbind(collect, single_peakgroup) | ||
| remove <- c(remove, peaklist_index) | ||
|
|
||
| # remove current entry from index | ||
| index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] | ||
| } | ||
|
|
||
| # remove duplicate entries | ||
| if (!is.null(remove)) { | ||
| peakgroup_list <- peakgroup_list[-remove, ] | ||
| } | ||
| # append deduplicated entries | ||
| peakgroup_list_dedup <- rbind(peakgroup_list, collect) | ||
| return(peakgroup_list_dedup) | ||
| } | ||
|
|
||
| calculate_zscores <- function(peakgroup_list) { | ||
| #' Calculate Z-scores for peak groups based on average and standard deviation of controls | ||
| #' | ||
| #' @param peakgroup_list: Peak group list (matrix) | ||
| #' @param sort_col: Column to sort on (string) | ||
| #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) | ||
| #' | ||
| #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) | ||
|
|
||
| case_label <- "P" | ||
| control_label <- "C" | ||
| # get index for new column names | ||
| startcol <- ncol(peakgroup_list) + 3 | ||
|
|
||
| # calculate mean and standard deviation for Control group | ||
| ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) | ||
| case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) | ||
| int_cols <- c(ctrl_cols, case_cols) | ||
| # set all zeros to NA | ||
| peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA | ||
| ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] | ||
| peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) | ||
| peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) | ||
|
|
||
| # set new column names and calculate Z-scores | ||
| colnames_zscores <- NULL | ||
| for (col_index in int_cols) { | ||
| col_name <- colnames(peakgroup_list)[col_index] | ||
| colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) | ||
| zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - | ||
| peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls | ||
| peakgroup_list <- cbind(peakgroup_list, zscores_1col) | ||
| } | ||
|
|
||
| # apply new column names to columns at end plus avg and sd columns | ||
| colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores | ||
|
|
||
| # add ppm deviation column | ||
| zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) | ||
| # calculate ppm deviation | ||
| for (row_index in seq_len(nrow(peakgroup_list))) { | ||
| if (!is.na(peakgroup_list$theormz_HMDB[row_index]) && | ||
| !is.null(peakgroup_list$theormz_HMDB[row_index]) && | ||
| (peakgroup_list$theormz_HMDB[row_index] != "")) { | ||
| peakgroup_list$ppmdev[row_index] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[row_index])) - | ||
| as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index]))) / | ||
| as.numeric(as.vector(peakgroup_list$theormz_HMDB[row_index])) | ||
| } else { | ||
| peakgroup_list$ppmdev[row_index] <- NA | ||
| } | ||
| } | ||
|
|
||
| return(peakgroup_list) | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| fill_missing_intensities <- function(peakgroup_list, repl_pattern, thresh, not_random = FALSE) { | ||
| #' Replace intensities that are zero with random value | ||
| #' | ||
| #' @param peakgroup_list: Peak groups (matrix) | ||
| #' @param repl_pattern: Replication pattern (list of strings) | ||
| #' @param thresh: Value for threshold between noise and signal (integer) | ||
| #' | ||
| #' @return final_outlist: peak groups with filled-in intensities (matrix) | ||
|
|
||
| # for unit test, turn off randomness | ||
| if (not_random) { | ||
| set.seed(123) | ||
| } | ||
mraves2 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # replace missing intensities with random values around threshold | ||
| if (!is.null(peakgroup_list)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe move if not null statement to main script. Also missing else, what happens if it is null?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would like to keep the main script as 'clean' as possible, so I prefer to keep the if statement inside the function. |
||
| for (sample_index in seq_along(names(repl_pattern))) { | ||
| sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] | ||
| zero_intensity <- which(sample_peaks <= 0) | ||
| if (!length(zero_intensity)) { | ||
| next | ||
| } | ||
| for (zero_index in seq_along(zero_intensity)) { | ||
| peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, | ||
| mean = thresh, | ||
| sd = 100) | ||
| } | ||
| } | ||
|
|
||
| # Add column with average intensity; find intensity columns first | ||
| int_cols <- which(colnames(peakgroup_list) %in% names(repl_pattern)) | ||
| peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, int_cols], 1, mean)) | ||
|
|
||
| return(peakgroup_list) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Return is within the if-statement, what happens if peakgroup_list is null?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comment above; a null object is returned. |
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| # unit tests for FillMissing | ||
| # function: fill_missing_intensities | ||
| source("../../preprocessing/fill_missing_functions.R") | ||
|
|
||
| # test fill_missing_intensities | ||
| testthat::test_that("missing values are corretly filled with random values", { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add function name that is tested
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function names have not been added in the test_that line for any other unit test; this is a good idea for the general standardization for version 3.5. |
||
| # create peakgroup_list to test on in diagnostics setting | ||
| test_peakgroup_list <- data.frame(matrix(NA, nrow = 4, ncol = 23)) | ||
| colnames(test_peakgroup_list) <- c("mzmed.pgrp", "nrsamples", "ppmdev", "assi_HMDB", "all_hmdb_names", | ||
| "iso_HMDB", "HMDB_code", "all_hmdb_ids", "sec_hmdb_ids", "theormz_HMDB", | ||
| "C101.1", "C102.1", "P2.1", "P3.1", | ||
| "avg.int", "assi_noise", "theormz_noise", "avg.ctrls", "sd.ctrls", | ||
| "C101.1_Zscore", "C102.1_Zscore", "P2.1_Zscore", "P3.1_Zscore") | ||
| test_peakgroup_list[, c(1)] <- 300 + runif(4) | ||
| test_peakgroup_list[, c(2, 3)] <- runif(8) | ||
| test_peakgroup_list[, "HMDB_code"] <- c("HMDB1234567", "HMDB1234567_1", "HMDB1234567_2", "HMDB1234567_7") | ||
| test_peakgroup_list[, "all_hmdb_ids"] <- paste(test_peakgroup_list[, "HMDB_code"], | ||
| test_peakgroup_list[, "HMDB_code"], sep = ";") | ||
| test_peakgroup_list[, "all_hmdb_names"] <- paste(test_peakgroup_list[, "assi_HMDB"], | ||
| test_peakgroup_list[, "assi_HMDB"], sep = ";") | ||
| test_peakgroup_list[, grep("C", colnames(test_peakgroup_list))] <- 1000 * (1:16) | ||
|
Comment on lines
+8
to
+21
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same code as test_sum_intensities_adducts.R. Maybe make a txt file with the test_peakgroup_list table and load the table in the test_that() function
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. test_sum_adducts.R will be modified in version 3.5. |
||
| test_peakgroup_list[, grep("P", colnames(test_peakgroup_list))] <- 0 | ||
| test_repl_pattern <- c(list(1), list(2), list(3), list(4)) | ||
| names(test_repl_pattern) <- c("C101.1", "C102.1", "P2.1", "P3.1") | ||
| test_thresh <- 2000 | ||
|
|
||
| # create a large peak group list to test for negative values | ||
| test_large_peakgroup_list <- rbind(test_peakgroup_list, test_peakgroup_list) | ||
| for (index in 1:15) { | ||
| test_large_peakgroup_list <- rbind(test_large_peakgroup_list, test_large_peakgroup_list) | ||
| } | ||
| # for the sake of time, leave only one intensity column with zeros | ||
| test_large_peakgroup_list$P2.1 <- 1 | ||
|
|
||
| expect_equal(round(fill_missing_intensities(test_peakgroup_list, test_repl_pattern, test_thresh, not_random = TRUE)$P2.1), | ||
| c(1944, 1977, 2156, 2007), TRUE, tolerance = 0.1) | ||
| # fill_missing_intensities should not produce any negative values, even if a large quantity of numbers are filled in | ||
| start.time <- Sys.time() | ||
| expect_gt(min(fill_missing_intensities(test_large_peakgroup_list, test_repl_pattern, test_thresh, not_random = FALSE)$P3.1), | ||
| 0, TRUE) | ||
| end.time <- Sys.time() | ||
| time.taken <- end.time - start.time | ||
| time.taken | ||
mraves2 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| }) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could the indices be changes to column names, for readability and if column orders change in the future?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ppmdev column is already present in the peak grouplist, so this section is refactored. Referring to columns by number has been removed.