From 871819923f6d7fdceb1d5612d675139fcdeb269f Mon Sep 17 00:00:00 2001 From: Carolyn Allen Date: Mon, 1 May 2023 08:09:00 -0700 Subject: [PATCH 1/6] initial quant module --- diadem/quantify/quant.py | 343 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 diadem/quantify/quant.py diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py new file mode 100644 index 0000000..f8830c2 --- /dev/null +++ b/diadem/quantify/quant.py @@ -0,0 +1,343 @@ +import math + +import numpy as np +import polars as pl +from loguru import logger + + +def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None: + """Main function. + + Parameters + ---------- + ms_data : pl.DataFrame + Mass spec data file. + pep : pl.DataFrame + Peptide data file. + """ + # TODO: protein quant + # TODO: MAJOR OPTIMIZATION - I spent 2 days on this so there are LOTS + # of ways top optimize + # TODO: tests + + # Match the peptide to spectra + pep_spectrum_df = match_peptide(ms_data, pep) + # Perform peptide quant + peptide_quant(ms_data, pep_spectrum_df) + + +def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame: + """Helper function to get the intensity of a peak at a given retention time. + + Parameters + ---------- + ms_data : pl.DataFrame + Mass spec data file. + pep : pl.DataFrame + Peptide data file. + + Returns + ------- + pl.DataFrame: + DataFrame with the matched peptide and mass spec data. + """ + # Initialize the dictionary that will be converted into the returned dataframe + data = {"peptide": [], "rt": [], "intensity": [], "mz": [], "ims": []} + + # Iterate through each row of the mass spec data and find the corresponding peptides + # in the peptide dataframe. + for row in ms_data.rows(named=True): + # Convert the mzs to a series so they are easier to use + row["mzs"] = pl.Series(row["mzs"]) + # Get all peptides at the given retention time + peptide = pep.filter(pl.col("RetentionTime") == row["rts"]) + # For each peptide at the retention time, find the most intense peak and sum + # across ion mobility + for p in peptide.rows(named=True): + # For each mz in the peptide mz list, get the mass spec mzs that are within + # .02 of that mz. This `grouped_mz` list contains lists of indicies of mzs + # from the mass spec data file that are close to the peptide mz. + grouped_mzs = [] + for mz in p["mzs"]: + mzs = list( + ((row["mzs"] <= mz + 0.02) & (row["mzs"] >= mz - 0.02)).arg_true() + ) + # If those mzs have already been selected, continue. Otherwise add the + # values to a list. + if mzs in grouped_mzs: + continue + else: + found = False + for group in grouped_mzs: + if all(x in group for x in mzs): + found = True + break + if not found: + grouped_mzs.append(mzs) + # Initialize data to hold the information for the most intense peak + most_intense = 0 + most_intense_mz = 0 + most_intense_ims = 0 + # Look at the mzs that are close to each other. + for group in grouped_mzs: + sum_intensity = 0 + total_mz = 0 + total_ims = 0 + num = 0 + # Sum mzs that are close to each other and average the mz and ims + for ind in group: + if math.isclose(row["ims"][ind], p["IonMobility"], abs_tol=0.03): + sum_intensity += row["intensities"][ind] + total_mz += row["mzs"][ind] + total_ims += row["ims"][ind] + num += 1 + if sum_intensity > most_intense: + most_intense = sum_intensity + most_intense_ims = total_ims / num + most_intense_mz = total_mz / num + # Add the most intense peaks to the data + data["peptide"].append(p["peptide"]) + data["intensity"].append(most_intense) + data["rt"].append(row["rts"]) + data["mz"].append(most_intense_mz) + data["ims"].append(most_intense_ims) + + # Return the polars dataframe with the peptide-spectrum matched data + return pl.DataFrame( + data, + schema=[ + ("peptide", pl.Utf8), + ("rt", pl.Float64), + ("intensity", pl.Float64), + ("mz", pl.Float64), + ("ims", pl.Float64), + ], + ) + + +def get_peaks( + ms_data: pl.DataFrame, row: dict, i: int, next_val: float, right: float = True +) -> list: + """Helper function to get the corresponding peaks of the peptide at different + retention times. + + Parameters + ---------- + ms_data : pl.DataFrame + Mass spec data file. + row : dict + Row of the peptide data that contains the peptide we are interested in. + i : int + Index of the peptide of interest in the mass spec data file. + next_val : float + Intensity of next peak either to the right or the left. + This help us to determine what part of the curve we are on (if we should stop + as soon as the intensities increase/level off, or if the values should increase + before they decrease). + right : float, default = True + True if we are looking at intensities to the right of the first measurement. + False if we are looking to the left. + + Returns + ------- + list: + List containing the intensities of the matching peaks at different retention + times. Another list of the retention times those peaks were found at. + """ + # Initialize the lists that will contain the intensities/rts that we will return + intensities = [] + rts = [] + + # Keep track of the previous intensity so you know if the intensities are + # decreasing or increasing. This could be simplified to just look at the + # last item in the intensities list, but I'll do that later. + last_val = row["intensity"] + # Keep track of the number of times the intensity isn't changing (i.e. peak is + # leveling off). + count = 0 + + # Here we determine if the peak is increasing or decreasing at this moment in time. + # If it is decreasing, we should stop once the intensities increase or level off. + # If it is increasing, we should see the intensities increase and then decrease. + break_if_up = True + if row["intensity"] < next_val: + break_if_up = False + + # This is the loop that is getting all the intensities/rts for a peptide. + while True: + # If we are looking to the right of the matched intensity, make sure we are + # in the range of our data and go 1 rt to the right. Otherwise, go 1 rt to + # the left. + if right: + if i + 1 > len(ms_data): + return intensities, rts + i += 1 + else: + if i - 1 < 0: + return intensities, rts + i -= 1 + + # Get the intensity and rt of the peak + sum_intensity, rt = get_intensity_val(ms_data, row, i) + + # Return the data if any of the following criteria is met: + # 1. the intensity is 0 of the current peak + # 2. the intensity is increasing when it should be decreasing + # 3. the intensity is leveling off + if ( + sum_intensity == 0 + or break_if_up + and sum_intensity > last_val + last_val / 100 + or count > 3 + ): + return intensities, rts + + # If the intensity is decreasing when it should be, append the information to + # our lists. Check if the values are roughly the same, meaning it is leveling + # off. + elif break_if_up and sum_intensity < last_val: + intensities.append(sum_intensity) + rts.append(rt) + last_val = sum_intensity + if sum_intensity > last_val + last_val / 100: + count += 1 + else: + count = 0 + + # If the intensity is increasing when it should be, append the information + # to our lists. + elif not break_if_up and sum_intensity > last_val: + intensities.append(sum_intensity) + rts.append(rt) + last_val = sum_intensity + count = 0 + + # If the intensity is decreasing when it should be increasing, this means + # we have gone over the top of our peak. Append the information to our + # lists and set `break_if_up` to True. + elif not break_if_up and sum_intensity < last_val: + break_if_up = True + intensities.append(sum_intensity) + rts.append(rt) + last_val = sum_intensity + count = 0 + + else: + logger.info("Error: something weird happened") + return + + +def get_intensity_val(ms_data: pl.DataFrame, row: dict, i: int) -> int: + """Helper function to get the intensity of a peak at a given retention time. + + Parameters + ---------- + ms_data : pl.DataFrame + Mass spec data file. + row : dict + Row of the peptide data that contains the peptide we are interested in. + i : int + Index of the peptide of interest in the mass spec data file. + + Returns + ------- + int: + Integer containing the intensity of the peak. + Integer containing the rt of the peak. + """ + # Extract the relevant row from our mass spec data. + curr_row = ms_data[i] + + # Get all mz values that are close to the peptide mz value. + s = list( + ( + (curr_row["mzs"].item() <= row["mz"] + 0.02) + & (curr_row["mzs"].item() >= row["mz"] - 0.02) + ).arg_true() + ) + + # Sum the intensities to collapse the ims dimention. + sum_intensity = 0 + total_mz = 0 + num = 0 + for ind in s: + if math.isclose(row["ims"], curr_row["ims"].item()[ind], abs_tol=0.03): + sum_intensity += curr_row["intensities"].item()[ind] + total_mz += curr_row["mzs"].item()[ind] + num += 1 + return sum_intensity, curr_row["rts"].item() + + +def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFrame: + """Quantify peptides across rt. + + Parameters + ---------- + ms_data : pl.DataFrame + Mass spec data file. + matched_df : pl.DataFrame + Dataframe with peptide-spectrum matched data. + + Returns + ------- + pl.DataFrame: + DataFrame with the quantified peptides. + """ + # Initialize the dictionary that will be converted into the returned dataframe + pep_quant_data = {"peptide": [], "intensity": [], "mz": [], "num_fragments": []} + # Add row numbers to the mass spec dataframe so I can easily go to the right + # and left rts (this can be changed later to something more elegant) + num_ms_data = ms_data.with_row_count() + + # Iterate through each row in the peptide-spectrum matched dataframe and + # get the intensities for each peptide + for row in matched_df.rows(named=True): + # Get the row number in the mass spec dataframe for the current peptide + i = num_ms_data.filter(pl.col("rts") == row["rt"])["row_nr"].item() + # Get the intensities to the rt to the right and left of the current + # retention time. This can probably be moved to a different function + # or optimized later + sum_intensity_left, _ = get_intensity_val(ms_data, row, i - 1) + sum_intensity_right, _ = get_intensity_val(ms_data, row, i + 1) + + # Get the intensities/rts that match the peptide in rts to the left of + # the current rt. + i_left, rt_left = get_peaks(num_ms_data, row, i, sum_intensity_left, False) + # These lists must be reversed since they will be added by going backward + # in rt. + if len(i_left) > 1: + i_left, rt_left = i_left[::-1], rt_left[::-1] + # Get the intensities/rts that match the peptide in rts to the right of + # the current rt. + i_right, rt_right = get_peaks(num_ms_data, row, i, sum_intensity_right, True) + # Get a list with all intensities/rts for the peptide + all_intensities = i_left + [row["intensity"]] + i_right + rt_left + [row["rt"]] + rt_right + # If there is more than one peak, do a trapezoidal integration + if len(all_intensities) > 1: + summed = np.trapz( + all_intensities + ) # tried to do `x=all_rts` but it gave me a weird answer + # If there are no peaks, something went wrong. + elif len(all_intensities) == 0: + logger.info("Error: no peaks were found") + # If there is only one peak, the intensity is the peak intensity + else: + summed = all_intensities[0] + + # Store the data + pep_quant_data["peptide"].append(row["peptide"]) + pep_quant_data["intensity"].append(summed) + pep_quant_data["mz"].append(row["mz"]) + pep_quant_data["num_fragments"].append(len(all_intensities)) + + # Return the peptide quant dataframe + return pl.DataFrame( + pep_quant_data, + schema=[ + ("peptide", pl.Utf8), + ("intensity", pl.Float64), + ("mz", pl.Float64), + ("num_fragments", pl.Float64), + ], + ) \ No newline at end of file From 62c57c9f52df60adeb1805632084403a31da6d97 Mon Sep 17 00:00:00 2001 From: Carolyn Allen Date: Mon, 1 May 2023 08:14:19 -0700 Subject: [PATCH 2/6] update pre-commit --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2507587..e621c1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,17 +8,17 @@ repos: - id: trailing-whitespace - id: detect-private-key - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 23.3.0 hooks: - id: black language_version: python3.9 - repo: https://github.com/pycqa/isort - rev: 5.11.4 + rev: 5.12.0 hooks: - id: isort name: isort (python) - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.228 + rev: v0.0.263 hooks: - id: ruff args: ['--fix'] From bb2b297bf17531947d9f1dfea80845993e8e7fdb Mon Sep 17 00:00:00 2001 From: Carolyn Allen Date: Tue, 2 May 2023 10:39:54 -0700 Subject: [PATCH 3/6] added basic quant tests --- diadem/quantify/quant.py | 14 +++---- tests/conftest.py | 8 ++++ tests/data/small-ms_data.parquet | Bin 0 -> 2523 bytes tests/data/small-peptides.parquet | Bin 0 -> 1542 bytes tests/quant_test.py | 65 ++++++++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 tests/data/small-ms_data.parquet create mode 100644 tests/data/small-peptides.parquet create mode 100644 tests/quant_test.py diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py index f8830c2..a26b9d7 100644 --- a/diadem/quantify/quant.py +++ b/diadem/quantify/quant.py @@ -23,7 +23,7 @@ def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None: # Match the peptide to spectra pep_spectrum_df = match_peptide(ms_data, pep) # Perform peptide quant - peptide_quant(ms_data, pep_spectrum_df) + return peptide_quant(ms_data, pep_spectrum_df) def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame: @@ -118,8 +118,7 @@ def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame: def get_peaks( ms_data: pl.DataFrame, row: dict, i: int, next_val: float, right: float = True ) -> list: - """Helper function to get the corresponding peaks of the peptide at different - retention times. + """Helper function to get corresponding peaks of a peptide at different rts. Parameters ---------- @@ -169,7 +168,7 @@ def get_peaks( # in the range of our data and go 1 rt to the right. Otherwise, go 1 rt to # the left. if right: - if i + 1 > len(ms_data): + if i + 1 >= len(ms_data): return intensities, rts i += 1 else: @@ -186,8 +185,7 @@ def get_peaks( # 3. the intensity is leveling off if ( sum_intensity == 0 - or break_if_up - and sum_intensity > last_val + last_val / 100 + or (break_if_up and sum_intensity > last_val + last_val / 1000) or count > 3 ): return intensities, rts @@ -199,7 +197,7 @@ def get_peaks( intensities.append(sum_intensity) rts.append(rt) last_val = sum_intensity - if sum_intensity > last_val + last_val / 100: + if sum_intensity > last_val + last_val / 1000: count += 1 else: count = 0 @@ -340,4 +338,4 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra ("mz", pl.Float64), ("num_fragments", pl.Float64), ], - ) \ No newline at end of file + ) diff --git a/tests/conftest.py b/tests/conftest.py index cd2bcec..716ad66 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import numpy as np +import polars as pl import pytest from ms2ml import Peptide @@ -168,3 +169,10 @@ def albumin_peptides(): out = [Peptide.from_proforma_seq(f"{x}/2") for x in ALBUMIN_PEPTIDE_SEQS] return out + + +@pytest.fixture +def get_data(shared_datadir): + """Read the parquet files for the quant module and return the dataframes.""" + return pl.read_parquet(shared_datadir / "small-ms_data.parquet"), \ + pl.read_parquet(shared_datadir / "small-peptides.parquet") diff --git a/tests/data/small-ms_data.parquet b/tests/data/small-ms_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d341da1fd03059cc6c200a8dd937ea3832800d79 GIT binary patch literal 2523 zcmd5;TWph696#&Vm98snDBqXX45$w@)nQf{(?+-KoU&yXMYb+%?Wk`3+K;WHH~V#T z)|W9tG-NuEsgoEYYGRD@7EzaZp-yA=FcYH0OGFt$e33*R!0@2M|NngL25d|ocI(UGZ34*zkP60lv5 z$~o`Zge^qEE5f&@Xq#Kbza0LNsnh-RJOJ$bBuMXq>!n_`HeMT_?WcHgjP_6r z*RV28Ur#_Hs#-$6Z<*Tv)t$p1shdB(!!rE2I{T(j5}19SC4vs!3wIB^#jM03nyed$ z!RivC8C4$%t0{drr9?%uR4t9`r5V9)Bw8cJbT`w8w*@jP5Gale?jR}JJs2q6OCHP0 zKGKx+&@LMEADnyT@?hg35}wV}9qk^=K9`L+51u*{J4)ZBYe{#v(`m7o%?5+8ch48u zYpxGA5fFAe4_s=Q9?B~R6z9{c$;Nyc#o<^=iK*dKSXrW+X`TJB9_tB-?S4K|9IIX= zFjk>f8)abu86&s`+bQwEIJ>g)tU?R5)Fw8a*ClhoEJ8A+LZ{5qfdXl$z&^P`gLd5r z`WZqRIkat<5*=wbj?WtNSS5+166{Z^@nn~p3M5l`+>reOpFl|bZ&>HSMX)OuV3k;C z0rZF|H`Ze}#qYHSqBm9I5_}r3*N5_))%fl2qrm<|jl7HM%Ab;cn)7P!V4^+8_ZN3) zgs*t?#CDCMZN4jBjkEAr2Mf&KHbgD0e(!nm1t%nHPQBX*b+CNH1ETg4L-QDu~wxu0kiqJlnw@BK3eH?yZOm)?AwV+Li%NS3W2R zd0aOG2}El{eAqp?e72%LAl!%XAjJm8d>k630k0<_^YC>p)MH4-(8ZmQy<6G+SYHM$ zwc@}TRz0KxLEKG<1A9Gh7{df`wIspG1ucR&SO;^0(47uO+xtS^=8Rm~ytA{TY8RT` z8hqCx{9vW0HyCTZMfkiQPlOtC{iwvu-8}zXKiWaCbU%1+7v{}}f$YNO;A#cQ((~r( za^znc`poABAyC>7adviKPZ_J={NwV6@gC@?0$o)G=9=it&+q@4FLH><-UP6#riNHH wvSc!z-ekMS_DCQV-mTa?{c6e>p>7d?4E{siLDg9qZx#G@B~yf?dTAOSSSmvrXsoA#yzF(Ieo?i(u2`R;heqdE%#^+|tPnN@S;P_qC`)Qvrx51>Z)^12uyGWqnE2Y^`qVMm?|B937DNp7igK18Ji^XH@j6Ztrx3IP4+8 z5bL8BgZ|*{duYXVkGyo(v2)wIMPa0yNqWv6JeP zaj%SGQE&MUoR3Sgkd1;&m@4FR+|##rkxtMjObUjAWu+au;EiF&emY2$7v9jHUIq5u zPZ?1@`Z}2x&Xhgvr7}@|N}X)^G~KSYFf8__7m;84ySe90fxX8}_Ql5}rH{Rm25}s6 zfUD|!@pfZ%L#wUG-=qlIWGl`&Pr#2)Zo%_*7F~E2q5GUS_+8Fn&+_36oVQwdp5}br z&0Byyz7EGci8V7AkI&&D0`vK{Xo4;jiBw!Uz(95mabZ_oJWk&AM z=av@LYNf84mFW1wN^!J`6#{<@z4#`sE`9;rv+;#=xJ);YzqpLkyoB{U3lSidz`nR= z9Z8Td+A!+h0^Z`@&;oc~%@HXepW8hq~I!w&xdzX8phXOI8@ literal 0 HcmV?d00001 diff --git a/tests/quant_test.py b/tests/quant_test.py new file mode 100644 index 0000000..42d3f00 --- /dev/null +++ b/tests/quant_test.py @@ -0,0 +1,65 @@ +import numpy as np +import polars as pl +from polars.testing import assert_frame_equal + +import diadem.quantify.quant as q + + +def test_quant(get_data): + """Test that the entire module returns the correct data.""" + # NOTE: For some reason polars `assert_frame_equals` function is + # failing, but pandas works fine. I'll fix later. + ms_data, peptides = get_data + quant_results = q.quant(ms_data, peptides).to_pandas() + manual_results = pl.DataFrame( + { + "peptide": ["PEPTIDEEEE", "LESLIE"], + "intensity": [ + np.trapz([610.00, 100.00]), + np.trapz([1111.00, (2210.00 + 1560.00), 4000.00]), + ], + "mz": [100.01, np.average([249.99, 249.985])], + "num_fragments": [2.0, 3.0], + } + ).to_pandas() + assert quant_results.equals(manual_results) + # assert_frame_equal(quant_results, manual_results) + # ^^^ NOT WORKING even when comparing the same dataframe + + +def test_match_peptide(get_data): + """Test that the module correctly matches the peptides to the spectra.""" + ms_data, peptides = get_data + match_result = q.match_peptide(ms_data, peptides) + manual_result = pl.DataFrame( + { + "peptide": ["PEPTIDEEEE", "LESLIE"], + "rt": [100.0, 150.0], + "intensity": [610.00, (2210.00 + 1560.00)], + "mz": [100.01, np.average([249.99, 249.985])], + "ims": [99.982, np.average([149.98, 150.01])], + } + ) + assert_frame_equal(match_result, manual_result) + + +def test_get_peaks(get_data): + """Test that the quant module is accurately identifying the + corresponding peaks at different rts. + """ + ms_data, peptides = get_data + pep_spectrum_df = q.match_peptide(ms_data, peptides) + row = pep_spectrum_df.row( + by_predicate=(pl.col("peptide") == "PEPTIDEEEE"), named=True + ) + num_ms_data = ms_data.with_row_count() + + r_intensities, r_rts = q.get_peaks(num_ms_data, row, 1, 100.0) + r_intensities_manual = [100.0] + r_rts_manual = [110.0] + assert r_intensities == r_intensities_manual and r_rts == r_rts_manual + + l_intensities, l_rts = q.get_peaks(num_ms_data, row, 1, 0, False) + l_intensities_manual = [] + l_rts_manual = [] + assert l_intensities == l_intensities_manual and l_rts == l_rts_manual From d12e64ae71a2a6628ee72472990b35592458f3fb Mon Sep 17 00:00:00 2001 From: Carolyn Allen Date: Thu, 4 May 2023 11:16:57 -0600 Subject: [PATCH 4/6] updated tests --- diadem/quantify/quant.py | 12 ++++++--- tests/data/small-ms_data.parquet | Bin 2523 -> 2759 bytes tests/data/small-peptides.parquet | Bin 1542 -> 1583 bytes tests/quant_test.py | 42 ++++++++++++++++++------------ 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py index a26b9d7..1b26327 100644 --- a/diadem/quantify/quant.py +++ b/diadem/quantify/quant.py @@ -186,7 +186,7 @@ def get_peaks( if ( sum_intensity == 0 or (break_if_up and sum_intensity > last_val + last_val / 1000) - or count > 3 + or count >= 3 ): return intensities, rts @@ -219,7 +219,12 @@ def get_peaks( rts.append(rt) last_val = sum_intensity count = 0 - + elif last_val == sum_intensity: + count += 1 + if count >= 3: + return intensities, rts + intensities.append(sum_intensity) + rts.append(rt) else: logger.info("Error: something weird happened") return @@ -286,7 +291,8 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra # Add row numbers to the mass spec dataframe so I can easily go to the right # and left rts (this can be changed later to something more elegant) num_ms_data = ms_data.with_row_count() - + matched_df.to_pandas() + num_ms_data.to_pandas() # Iterate through each row in the peptide-spectrum matched dataframe and # get the intensities for each peptide for row in matched_df.rows(named=True): diff --git a/tests/data/small-ms_data.parquet b/tests/data/small-ms_data.parquet index d341da1fd03059cc6c200a8dd937ea3832800d79..de7fa1fb4ea036cd73a9f86f61ddf35d1c0b9cca 100644 GIT binary patch delta 1064 zcmcaDd|cEez%j^Bluh&ohv*IF7||9+Q3g>JFcuMHU=Y4n{eExm?QOaF!vE2M##a5m z3=y_$3@)4#P5aFlA~O#Zi7+r&i!w+YOi+-XwUtdaOocCT%iP5~uO3MFGMCw3wt|Cs zJx`Ogv*3o1gVDUF7#tWU|6`H#ZebMb;uPy*QR85Mxa12tE?LXQkitIMj?uE-7z8#) zJItC11UI8EN1r_d1UxaqF=x(1?~Hax1A(? zXBDw)OkyQ!Pq?85x!^WvEdxUY*dT+6K!Z4#8Q9eJ1SJB^ImjgG2{xxlZ4oat9ef}s z9n5882$?LxywDJwB7s2(Ns-z&gJ;ieIyS9ugYn{WkBGtrlKeKt6PnH0X2`v2o6Gw1 zoQTNehpdvmkjPmCj2u1&Nf|K?(JmG-po4+TTP&i>;A~FOYm89wC)}bXQ05|D(IS{? zKG8)ildV~Hai~bjNamfJt81;r|`*u0#z6hsA7x25`TmyKVX%TLH4NF4iTVz bVvCf;RMa@c`UEzMvh8E$n9j<;04g2;b6`)? delta 872 zcmX>udRy2uz%j^Bluh&xv*;Y=7*QKh22ma$782zWWMJUDSN-0W^FIXeYi!m3%Mh`b zmEjA=M3?@0euirShq{g^_G~LQ*)Ao~v#%@ySlvk_avB6fyZje`MV`wS*xY+uXFFol_cA<}_iauK76 zl`KO-f`NgA1P>1z8*_8x!2>83ID}F}v6+BsfWbtB0rS};t)K?X$~;Fsq&5W{{K-;}AW=EC%!qkU5J* z^aK->y^CG6i4n^Fz#*CfWzOS*Gf!}fK4G4$!n(_hM^Z*IN9_u;3=v63>$Za7S@ zk2nZ40|XrU9l&(P1PBcj5#@rI${kRYnp|2`oL}S{1vWWBtVrz$Bgo`=jG{R)qC!xI zbBMA59nQLqk@f$7W{B%e85tTF7$O}QG7l6bIpjYmeUR$F$>1zFql>}7K!Ssdjd|zV z9tLxr2G+^@m?Ry9#Lh5^&0)FH-#In?Gu#jLxB*F-|uRZhgeDjksb23XRp}x*iYhz=Olo8_)ZDAAxh8B={gi*8z z%KXM8ItwH$3=+S=BANviZ(|e9n*5ZpDS}N>MlwdNhfxM!bctvN4!!9bppmvB=1{54l@Zb>p0@m<>4Pi76KZJ}r)BFX^6f(#7I%oPjm7Rx%7}4@7Ks5(2C_CV ziY7ssmzYGG7@_PrEO6mNtfEPid6=5CStMm7W7Mi-@C3iu6E;y12DM3(cQZ*k!UI+8 z2b-uEgW45l8Dc_5tdCt(f Date: Thu, 4 May 2023 11:22:10 -0600 Subject: [PATCH 5/6] remove debugging statements --- diadem/quantify/quant.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py index 1b26327..3b919fb 100644 --- a/diadem/quantify/quant.py +++ b/diadem/quantify/quant.py @@ -219,6 +219,9 @@ def get_peaks( rts.append(rt) last_val = sum_intensity count = 0 + + # If the intensities are the same, update the count and return if + # the count is 3 or more. Otherwise, append to intensities and rts. elif last_val == sum_intensity: count += 1 if count >= 3: @@ -291,8 +294,7 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra # Add row numbers to the mass spec dataframe so I can easily go to the right # and left rts (this can be changed later to something more elegant) num_ms_data = ms_data.with_row_count() - matched_df.to_pandas() - num_ms_data.to_pandas() + # Iterate through each row in the peptide-spectrum matched dataframe and # get the intensities for each peptide for row in matched_df.rows(named=True): From 4a77a720f221f4281307509228623773984d6fed Mon Sep 17 00:00:00 2001 From: Carolyn Allen Date: Thu, 4 May 2023 11:34:08 -0600 Subject: [PATCH 6/6] check precursor information --- diadem/quantify/quant.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py index 3b919fb..d527543 100644 --- a/diadem/quantify/quant.py +++ b/diadem/quantify/quant.py @@ -27,7 +27,7 @@ def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None: def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame: - """Helper function to get the intensity of a peak at a given retention time. + """Map the peptide to a peak in the mass spec data. Parameters ---------- @@ -49,8 +49,13 @@ def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame: for row in ms_data.rows(named=True): # Convert the mzs to a series so they are easier to use row["mzs"] = pl.Series(row["mzs"]) + # Make sure we are looking at the correct precursor window + peptides = pep.filter( + (pl.col("PrecursorMZ") >= row["precursor_start"]) + & (pl.col("PrecursorMZ") <= row["precursor_end"]) + ) # Get all peptides at the given retention time - peptide = pep.filter(pl.col("RetentionTime") == row["rts"]) + peptide = peptides.filter(pl.col("RetentionTime") == row["rts"]) # For each peptide at the retention time, find the most intense peak and sum # across ion mobility for p in peptide.rows(named=True):