From 871819923f6d7fdceb1d5612d675139fcdeb269f Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Mon, 1 May 2023 08:09:00 -0700
Subject: [PATCH 1/6] initial quant module

---
 diadem/quantify/quant.py | 343 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 343 insertions(+)
 create mode 100644 diadem/quantify/quant.py

diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py
new file mode 100644
index 0000000..f8830c2
--- /dev/null
+++ b/diadem/quantify/quant.py
@@ -0,0 +1,343 @@
+import math
+
+import numpy as np
+import polars as pl
+from loguru import logger
+
+
+def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None:
+    """Main function.
+
+    Parameters
+    ----------
+    ms_data : pl.DataFrame
+        Mass spec data file.
+    pep : pl.DataFrame
+        Peptide data file.
+    """
+    # TODO: protein quant
+    # TODO: MAJOR OPTIMIZATION - I spent 2 days on this so there are LOTS
+    # of ways top optimize
+    # TODO: tests
+
+    # Match the peptide to spectra
+    pep_spectrum_df = match_peptide(ms_data, pep)
+    # Perform peptide quant
+    peptide_quant(ms_data, pep_spectrum_df)
+
+
+def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame:
+    """Helper function to get the intensity of a peak at a given retention time.
+
+    Parameters
+    ----------
+    ms_data : pl.DataFrame
+        Mass spec data file.
+    pep : pl.DataFrame
+        Peptide data file.
+
+    Returns
+    -------
+    pl.DataFrame:
+        DataFrame with the matched peptide and mass spec data.
+    """
+    # Initialize the dictionary that will be converted into the returned dataframe
+    data = {"peptide": [], "rt": [], "intensity": [], "mz": [], "ims": []}
+
+    # Iterate through each row of the mass spec data and find the corresponding peptides
+    # in the peptide dataframe.
+    for row in ms_data.rows(named=True):
+        # Convert the mzs to a series so they are easier to use
+        row["mzs"] = pl.Series(row["mzs"])
+        # Get all peptides at the given retention time
+        peptide = pep.filter(pl.col("RetentionTime") == row["rts"])
+        # For each peptide at the retention time, find the most intense peak and sum
+        # across ion mobility
+        for p in peptide.rows(named=True):
+            # For each mz in the peptide mz list, get the mass spec mzs that are within
+            # .02 of that mz. This `grouped_mz` list contains lists of indicies of mzs
+            # from the mass spec data file that are close to the peptide mz.
+            grouped_mzs = []
+            for mz in p["mzs"]:
+                mzs = list(
+                    ((row["mzs"] <= mz + 0.02) & (row["mzs"] >= mz - 0.02)).arg_true()
+                )
+                # If those mzs have already been selected, continue. Otherwise add the
+                # values to a list.
+                if mzs in grouped_mzs:
+                    continue
+                else:
+                    found = False
+                    for group in grouped_mzs:
+                        if all(x in group for x in mzs):
+                            found = True
+                            break
+                    if not found:
+                        grouped_mzs.append(mzs)
+            # Initialize data to hold the information for the most intense peak
+            most_intense = 0
+            most_intense_mz = 0
+            most_intense_ims = 0
+            # Look at the mzs that are close to each other.
+            for group in grouped_mzs:
+                sum_intensity = 0
+                total_mz = 0
+                total_ims = 0
+                num = 0
+                # Sum mzs that are close to each other and average the mz and ims
+                for ind in group:
+                    if math.isclose(row["ims"][ind], p["IonMobility"], abs_tol=0.03):
+                        sum_intensity += row["intensities"][ind]
+                        total_mz += row["mzs"][ind]
+                        total_ims += row["ims"][ind]
+                        num += 1
+                    if sum_intensity > most_intense:
+                        most_intense = sum_intensity
+                        most_intense_ims = total_ims / num
+                        most_intense_mz = total_mz / num
+            # Add the most intense peaks to the data
+            data["peptide"].append(p["peptide"])
+            data["intensity"].append(most_intense)
+            data["rt"].append(row["rts"])
+            data["mz"].append(most_intense_mz)
+            data["ims"].append(most_intense_ims)
+
+    # Return the polars dataframe with the peptide-spectrum matched data
+    return pl.DataFrame(
+        data,
+        schema=[
+            ("peptide", pl.Utf8),
+            ("rt", pl.Float64),
+            ("intensity", pl.Float64),
+            ("mz", pl.Float64),
+            ("ims", pl.Float64),
+        ],
+    )
+
+
+def get_peaks(
+    ms_data: pl.DataFrame, row: dict, i: int, next_val: float, right: float = True
+) -> list:
+    """Helper function to get the corresponding peaks of the peptide at different
+    retention times.
+
+    Parameters
+    ----------
+    ms_data : pl.DataFrame
+        Mass spec data file.
+    row : dict
+        Row of the peptide data that contains the peptide we are interested in.
+    i : int
+        Index of the peptide of interest in the mass spec data file.
+    next_val : float
+        Intensity of next peak either to the right or the left.
+        This help us to determine what part of the curve we are on (if we should stop
+        as soon as the intensities increase/level off, or if the values should increase
+        before they decrease).
+    right : float, default = True
+        True if we are looking at intensities to the right of the first measurement.
+        False if we are looking to the left.
+
+    Returns
+    -------
+    list:
+        List containing the intensities of the matching peaks at different retention
+        times. Another list of the retention times those peaks were found at.
+    """
+    # Initialize the lists that will contain the intensities/rts that we will return
+    intensities = []
+    rts = []
+
+    # Keep track of the previous intensity so you know if the intensities are
+    # decreasing or increasing. This could be simplified to just look at the
+    # last item in the intensities list, but I'll do that later.
+    last_val = row["intensity"]
+    # Keep track of the number of times the intensity isn't changing (i.e. peak is
+    # leveling off).
+    count = 0
+
+    # Here we determine if the peak is increasing or decreasing at this moment in time.
+    # If it is decreasing, we should stop once the intensities increase or level off.
+    # If it is increasing, we should see the intensities increase and then decrease.
+    break_if_up = True
+    if row["intensity"] < next_val:
+        break_if_up = False
+
+    # This is the loop that is getting all the intensities/rts for a peptide.
+    while True:
+        # If we are looking to the right of the matched intensity, make sure we are
+        # in the range of our data and go 1 rt to the right. Otherwise, go 1 rt to
+        # the left.
+        if right:
+            if i + 1 > len(ms_data):
+                return intensities, rts
+            i += 1
+        else:
+            if i - 1 < 0:
+                return intensities, rts
+            i -= 1
+
+        # Get the intensity and rt of the peak
+        sum_intensity, rt = get_intensity_val(ms_data, row, i)
+
+        # Return the data if any of the following criteria is met:
+        #   1. the intensity is 0 of the current peak
+        #   2. the intensity is increasing when it should be decreasing
+        #   3. the intensity is leveling off
+        if (
+            sum_intensity == 0
+            or break_if_up
+            and sum_intensity > last_val + last_val / 100
+            or count > 3
+        ):
+            return intensities, rts
+
+        # If the intensity is decreasing when it should be, append the information to
+        # our lists. Check if the values are roughly the same, meaning it is leveling
+        # off.
+        elif break_if_up and sum_intensity < last_val:
+            intensities.append(sum_intensity)
+            rts.append(rt)
+            last_val = sum_intensity
+            if sum_intensity > last_val + last_val / 100:
+                count += 1
+            else:
+                count = 0
+
+        # If the intensity is increasing when it should be, append the information
+        # to our lists.
+        elif not break_if_up and sum_intensity > last_val:
+            intensities.append(sum_intensity)
+            rts.append(rt)
+            last_val = sum_intensity
+            count = 0
+
+        # If the intensity is decreasing when it should be increasing, this means
+        # we have gone over the top of our peak. Append the information to our
+        # lists and set `break_if_up` to True.
+        elif not break_if_up and sum_intensity < last_val:
+            break_if_up = True
+            intensities.append(sum_intensity)
+            rts.append(rt)
+            last_val = sum_intensity
+            count = 0
+
+        else:
+            logger.info("Error: something weird happened")
+            return
+
+
+def get_intensity_val(ms_data: pl.DataFrame, row: dict, i: int) -> int:
+    """Helper function to get the intensity of a peak at a given retention time.
+
+    Parameters
+    ----------
+    ms_data : pl.DataFrame
+            Mass spec data file.
+    row : dict
+        Row of the peptide data that contains the peptide we are interested in.
+    i : int
+            Index of the peptide of interest in the mass spec data file.
+
+    Returns
+    -------
+    int:
+        Integer containing the intensity of the peak.
+        Integer containing the rt of the peak.
+    """
+    # Extract the relevant row from our mass spec data.
+    curr_row = ms_data[i]
+
+    # Get all mz values that are close to the peptide mz value.
+    s = list(
+        (
+            (curr_row["mzs"].item() <= row["mz"] + 0.02)
+            & (curr_row["mzs"].item() >= row["mz"] - 0.02)
+        ).arg_true()
+    )
+
+    # Sum the intensities to collapse the ims dimention.
+    sum_intensity = 0
+    total_mz = 0
+    num = 0
+    for ind in s:
+        if math.isclose(row["ims"], curr_row["ims"].item()[ind], abs_tol=0.03):
+            sum_intensity += curr_row["intensities"].item()[ind]
+            total_mz += curr_row["mzs"].item()[ind]
+            num += 1
+    return sum_intensity, curr_row["rts"].item()
+
+
+def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFrame:
+    """Quantify peptides across rt.
+
+    Parameters
+    ----------
+    ms_data : pl.DataFrame
+        Mass spec data file.
+    matched_df : pl.DataFrame
+        Dataframe with peptide-spectrum matched data.
+
+    Returns
+    -------
+    pl.DataFrame:
+        DataFrame with the quantified peptides.
+    """
+    # Initialize the dictionary that will be converted into the returned dataframe
+    pep_quant_data = {"peptide": [], "intensity": [], "mz": [], "num_fragments": []}
+    # Add row numbers to the mass spec dataframe so I can easily go to the right
+    # and left rts (this can be changed later to something more elegant)
+    num_ms_data = ms_data.with_row_count()
+
+    # Iterate through each row in the peptide-spectrum matched dataframe and
+    # get the intensities for each peptide
+    for row in matched_df.rows(named=True):
+        # Get the row number in the mass spec dataframe for the current peptide
+        i = num_ms_data.filter(pl.col("rts") == row["rt"])["row_nr"].item()
+        # Get the intensities to the rt to the right and left of the current
+        # retention time. This can probably be moved to a different function
+        # or optimized later
+        sum_intensity_left, _ = get_intensity_val(ms_data, row, i - 1)
+        sum_intensity_right, _ = get_intensity_val(ms_data, row, i + 1)
+
+        # Get the intensities/rts that match the peptide in rts to the left of
+        # the current rt.
+        i_left, rt_left = get_peaks(num_ms_data, row, i, sum_intensity_left, False)
+        # These lists must be reversed since they will be added by going backward
+        # in rt.
+        if len(i_left) > 1:
+            i_left, rt_left = i_left[::-1], rt_left[::-1]
+        # Get the intensities/rts that match the peptide in rts to the right of
+        # the current rt.
+        i_right, rt_right = get_peaks(num_ms_data, row, i, sum_intensity_right, True)
+        # Get a list with all intensities/rts for the peptide
+        all_intensities = i_left + [row["intensity"]] + i_right
+        rt_left + [row["rt"]] + rt_right
+        # If there is more than one peak, do a trapezoidal integration
+        if len(all_intensities) > 1:
+            summed = np.trapz(
+                all_intensities
+            )  # tried to do `x=all_rts` but it gave me a weird answer
+        # If there are no peaks, something went wrong.
+        elif len(all_intensities) == 0:
+            logger.info("Error: no peaks were found")
+        # If there is only one peak, the intensity is the peak intensity
+        else:
+            summed = all_intensities[0]
+
+        # Store the data
+        pep_quant_data["peptide"].append(row["peptide"])
+        pep_quant_data["intensity"].append(summed)
+        pep_quant_data["mz"].append(row["mz"])
+        pep_quant_data["num_fragments"].append(len(all_intensities))
+
+    # Return the peptide quant dataframe
+    return pl.DataFrame(
+        pep_quant_data,
+        schema=[
+            ("peptide", pl.Utf8),
+            ("intensity", pl.Float64),
+            ("mz", pl.Float64),
+            ("num_fragments", pl.Float64),
+        ],
+    )
\ No newline at end of file

From 62c57c9f52df60adeb1805632084403a31da6d97 Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Mon, 1 May 2023 08:14:19 -0700
Subject: [PATCH 2/6] update pre-commit

---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2507587..e621c1e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,17 +8,17 @@ repos:
   - id: trailing-whitespace
   - id: detect-private-key
 - repo: https://github.com/psf/black
-  rev: 22.12.0
+  rev: 23.3.0
   hooks:
     - id: black
       language_version: python3.9
 - repo: https://github.com/pycqa/isort
-  rev: 5.11.4
+  rev: 5.12.0
   hooks:
     - id: isort
       name: isort (python)
 - repo: https://github.com/charliermarsh/ruff-pre-commit
-  rev: v0.0.228
+  rev: v0.0.263
   hooks:
     - id: ruff
       args: ['--fix']

From bb2b297bf17531947d9f1dfea80845993e8e7fdb Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Tue, 2 May 2023 10:39:54 -0700
Subject: [PATCH 3/6] added basic quant tests

---
 diadem/quantify/quant.py          |  14 +++----
 tests/conftest.py                 |   8 ++++
 tests/data/small-ms_data.parquet  | Bin 0 -> 2523 bytes
 tests/data/small-peptides.parquet | Bin 0 -> 1542 bytes
 tests/quant_test.py               |  65 ++++++++++++++++++++++++++++++
 5 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 tests/data/small-ms_data.parquet
 create mode 100644 tests/data/small-peptides.parquet
 create mode 100644 tests/quant_test.py

diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py
index f8830c2..a26b9d7 100644
--- a/diadem/quantify/quant.py
+++ b/diadem/quantify/quant.py
@@ -23,7 +23,7 @@ def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None:
     # Match the peptide to spectra
     pep_spectrum_df = match_peptide(ms_data, pep)
     # Perform peptide quant
-    peptide_quant(ms_data, pep_spectrum_df)
+    return peptide_quant(ms_data, pep_spectrum_df)
 
 
 def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame:
@@ -118,8 +118,7 @@ def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame:
 def get_peaks(
     ms_data: pl.DataFrame, row: dict, i: int, next_val: float, right: float = True
 ) -> list:
-    """Helper function to get the corresponding peaks of the peptide at different
-    retention times.
+    """Helper function to get corresponding peaks of a peptide at different rts.
 
     Parameters
     ----------
@@ -169,7 +168,7 @@ def get_peaks(
         # in the range of our data and go 1 rt to the right. Otherwise, go 1 rt to
         # the left.
         if right:
-            if i + 1 > len(ms_data):
+            if i + 1 >= len(ms_data):
                 return intensities, rts
             i += 1
         else:
@@ -186,8 +185,7 @@ def get_peaks(
         #   3. the intensity is leveling off
         if (
             sum_intensity == 0
-            or break_if_up
-            and sum_intensity > last_val + last_val / 100
+            or (break_if_up and sum_intensity > last_val + last_val / 1000)
             or count > 3
         ):
             return intensities, rts
@@ -199,7 +197,7 @@ def get_peaks(
             intensities.append(sum_intensity)
             rts.append(rt)
             last_val = sum_intensity
-            if sum_intensity > last_val + last_val / 100:
+            if sum_intensity > last_val + last_val / 1000:
                 count += 1
             else:
                 count = 0
@@ -340,4 +338,4 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra
             ("mz", pl.Float64),
             ("num_fragments", pl.Float64),
         ],
-    )
\ No newline at end of file
+    )
diff --git a/tests/conftest.py b/tests/conftest.py
index cd2bcec..716ad66 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import numpy as np
+import polars as pl
 import pytest
 from ms2ml import Peptide
 
@@ -168,3 +169,10 @@ def albumin_peptides():
 
     out = [Peptide.from_proforma_seq(f"{x}/2") for x in ALBUMIN_PEPTIDE_SEQS]
     return out
+
+
+@pytest.fixture
+def get_data(shared_datadir):
+    """Read the parquet files for the quant module and return the dataframes."""
+    return pl.read_parquet(shared_datadir / "small-ms_data.parquet"), \
+           pl.read_parquet(shared_datadir / "small-peptides.parquet")
diff --git a/tests/data/small-ms_data.parquet b/tests/data/small-ms_data.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..d341da1fd03059cc6c200a8dd937ea3832800d79
GIT binary patch
literal 2523
zcmd5;TWph696#&Vm98snDBqXX45$w@)nQf{(?+-KoU&yXMYb+%?Wk`3+K;WHH~V#T
z)|W9tG-NuEsgoEYYGRD@7EzaZp-yA=FcYH0OGFt$e33*R!0@2M|NngL25d|oc<?0c
zIp=@*{cpe1w1s)@6%FFwg7~uF7pp}gn&ESsXj)B3(XUVbSyOaHTP2PUEB+zg&kD&!
z<Ks1^A6^^WK_}Mj`=d20)5bHIHSb*MrQgiJs*Y2TPON)BNx2>I(UGZ34*zkP60lv5
z$~o`Zge^qEE5f&@Xq<WwC>#Kbza0LNsnh-RJOJ$bBuMXq>!n_`HeMT_?WcHgjP_6r
z*RV28Ur#_Hs#-$6Z<*Tv)t$p1shdB(!!rE2I{T(j5}19SC4vs!3wIB^#jM03nyed$
z!RivC8C4$%t0{drr9?%uR4t9`r5V9)Bw8cJbT`w8w*@jP5Gale?jR}JJs2q6OCHP0
zKGKx+&@LMEADnyT@?hg35}wV}9qk^=K9`L+51u*{J4)ZBYe{#v(`m7o%?5+8ch48u
zYpxGA5fFAe4_s=Q9?B~R6z9{c$;Nyc#o<^=iK*dKSXrW+X`TJB9_tB-?S4K|9IIX=
zFjk>f8)abu86&s`+bQwEIJ>g)tU?R5)Fw8a*ClhoEJ8A+LZ{5qfdXl$z&^P`gLd5r
z`WZqRIkat<5*=wbj?WtNSS5+166{Z^@nn~p3M5l`+>reOpFl|bZ&>HSMX)OuV3k;C
z0rZF|H`Ze}#qYHSqBm9I5_}r3*N5_))%fl2qrm<|jl7HM%Ab;cn)7P!V4^+8_ZN3)
zgs*t?#CDCMZN4jBjkE<mjoQIJ3KTw*$;u5q6c1~p;Bna)p<lvF8EIDFq454?<!Akr
zr}J|U#i*t?l!R<1ZH^zmUA=qf#Z^V+t2UqA(_=_p_?mpT<(Kz9u$_KsNS7yx5RNWE
zj&p*2x|mqYB%?SZNDv2(hxOvLE_a?>Ar2Mf&KHbgD0e(!nm<k#i|4qfsJEVK)F<O{
z7$d3DS@3a*I)62QR_QJLnTa|gnl|Vx{JEjc)y!Simb)SKG=S@gqe-uwJI?`%UKUp}
zTg$8w`$?f}zE3y)O^_~@h!U~?W|%j=I{(E#B-+c9_LqtlV*k{b_gdl!Nw0F7lST73
z*H^f}Vt5+TWGN;xR6K8fZRrcfKzZo|Hy$s^8@raHg7iCPA{{H0%yy$RWnP1nu$Y<0
zv#sL+HQ1{}1D4B%3S3oe3uCMvKFqZn?gOoOggX~K#F&Z(u(z`E7SN~DsBdDdCuiq_
zy>1t%nHPQBX*b+CNH1ETg4L-QDu~wxu0kiqJlnw@BK3eH?yZOm)?AwV+Li%NS3W2R
zd0aOG2}El{eAqp?e72%LAl!%XAjJm8d>k630k0<_^YC>p)MH4-(8ZmQy<6G+SYHM$
zwc@}TRz0KxLEKG<1A9Gh7{df`wIspG1ucR&SO;^0(47uO+xtS^=8Rm~ytA{TY8RT`
z8hqCx{9vW0HyCTZMfkiQPlOtC{iwvu-8}zXKiWaCbU%1+7v{}}f$YNO;A#cQ((~r(
za^znc`poABAyC>7adviKPZ_J={NwV6@gC@?0$o)G=9=it&+q@4FLH><-UP6#riNHH
wvSc!z-ekMS_DCQV-mTa?{c6e<jwT{XREecP7LVEDJ8j&Ij205o0)OlN1y?Q26aWAK

literal 0
HcmV?d00001

diff --git a/tests/data/small-peptides.parquet b/tests/data/small-peptides.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..e0906a15ec5d76539918f41b4d2c2cc602ddca00
GIT binary patch
literal 1542
zcmb_d&rcIU6rQ$AAy8vXn8`Njp$FNRpeYnVFdjOU+7?0DZD}d#0SYUuwB5~iOQAP8
z8#Nw07>>p>7d?4E{siLDg9qZx#G@B~yf?dTAOSSSmvrXsoA<u=&G$Cj3|B{~pWdYR
z%2cAH^SeTb&v)%@<Of;&KuA9!B$LSG)6)qsKRnGQa<l0K4FrdXKPdN_nrZ868VxXs
zEi+5$BY=68PMnAN7$DGM&thWVcpjtu4%R@%(pH;R!?5O-1YU+!lzk5ZKB9#(4Y(xz
zR6>#yzF(Ieo?i(u2`R;heqdE%#^+|tPnN@S;P_qC`)Qvrx51>Z)^12uy<tnbt<|7K
zfW7pvU5|3){Y%!^1nr>GWqnE2Y^`qVMm?|B937DNp7igK18Ji^XH@j6Ztrx3IP4+8
z5bL8BgZ|*{duYXVkGyo(<NAA2H+<ORxKs*0NlBJr*tDO}z>v2)wIMPa0yNqWv6JeP
zaj%SGQE&MUoR3Sgkd1;&m@4FR+|##rkxtMjObUjAWu+au;EiF&emY2$7v9jHUIq5u
zPZ?1@`Z}2x&Xhgvr7}@|N}X)^G~KSYFf8__7m;84ySe90fxX8}_Ql5}rH{Rm25}s6
zfUD|!@pfZ%L#wUG-=qlIWGl`&Pr#2)Zo%_*7F~E2q5GUS_+8Fn&+_36oVQwdp5}br
z&0Byyz7EGci8V7AkI&&D0`vK{Xo4;jiBw!Uz(95mabZ_oJW<E_^aK<|phc4N65`Ba
zX|6cc%onHhmEyL!mfGaEkWU*=95Rp$mY9Ym%j0RRgFS)#!31y4GZylJ5a6z3>k&AM
z=av@LYNf84mFW1wN^!J`6#{<@z4#`sE`9;rv+;#=xJ);YzqpLkyoB{U3lSidz`nR=
z9Z8Td+A!+h0^Z`@&;o<W>c~<NIdUEm`ir<IsVti9|EcdnYNlEmE9|UBr*`rM!@2*C
z&<%&$xjzp4?E7O<M$8*@g%m7J#jC3+)m%z0a3WvlmSwb};Tz!xE4IF+h1F)m4(m0u
Zs@32F1u>%@HXepW8hq~I!w&xdzX8phXOI8@

literal 0
HcmV?d00001

diff --git a/tests/quant_test.py b/tests/quant_test.py
new file mode 100644
index 0000000..42d3f00
--- /dev/null
+++ b/tests/quant_test.py
@@ -0,0 +1,65 @@
+import numpy as np
+import polars as pl
+from polars.testing import assert_frame_equal
+
+import diadem.quantify.quant as q
+
+
+def test_quant(get_data):
+    """Test that the entire module returns the correct data."""
+    # NOTE: For some reason polars `assert_frame_equals` function is
+    # failing, but pandas works fine. I'll fix later.
+    ms_data, peptides = get_data
+    quant_results = q.quant(ms_data, peptides).to_pandas()
+    manual_results = pl.DataFrame(
+        {
+            "peptide": ["PEPTIDEEEE", "LESLIE"],
+            "intensity": [
+                np.trapz([610.00, 100.00]),
+                np.trapz([1111.00, (2210.00 + 1560.00), 4000.00]),
+            ],
+            "mz": [100.01, np.average([249.99, 249.985])],
+            "num_fragments": [2.0, 3.0],
+        }
+    ).to_pandas()
+    assert quant_results.equals(manual_results)
+    # assert_frame_equal(quant_results, manual_results)
+    # ^^^ NOT WORKING even when comparing the same dataframe
+
+
+def test_match_peptide(get_data):
+    """Test that the module correctly matches the peptides to the spectra."""
+    ms_data, peptides = get_data
+    match_result = q.match_peptide(ms_data, peptides)
+    manual_result = pl.DataFrame(
+        {
+            "peptide": ["PEPTIDEEEE", "LESLIE"],
+            "rt": [100.0, 150.0],
+            "intensity": [610.00, (2210.00 + 1560.00)],
+            "mz": [100.01, np.average([249.99, 249.985])],
+            "ims": [99.982, np.average([149.98, 150.01])],
+        }
+    )
+    assert_frame_equal(match_result, manual_result)
+
+
+def test_get_peaks(get_data):
+    """Test that the quant module is accurately identifying the
+    corresponding peaks at different rts.
+    """
+    ms_data, peptides = get_data
+    pep_spectrum_df = q.match_peptide(ms_data, peptides)
+    row = pep_spectrum_df.row(
+        by_predicate=(pl.col("peptide") == "PEPTIDEEEE"), named=True
+    )
+    num_ms_data = ms_data.with_row_count()
+
+    r_intensities, r_rts = q.get_peaks(num_ms_data, row, 1, 100.0)
+    r_intensities_manual = [100.0]
+    r_rts_manual = [110.0]
+    assert r_intensities == r_intensities_manual and r_rts == r_rts_manual
+
+    l_intensities, l_rts = q.get_peaks(num_ms_data, row, 1, 0, False)
+    l_intensities_manual = []
+    l_rts_manual = []
+    assert l_intensities == l_intensities_manual and l_rts == l_rts_manual

From d12e64ae71a2a6628ee72472990b35592458f3fb Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Thu, 4 May 2023 11:16:57 -0600
Subject: [PATCH 4/6] updated tests

---
 diadem/quantify/quant.py          |  12 ++++++---
 tests/data/small-ms_data.parquet  | Bin 2523 -> 2759 bytes
 tests/data/small-peptides.parquet | Bin 1542 -> 1583 bytes
 tests/quant_test.py               |  42 ++++++++++++++++++------------
 4 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py
index a26b9d7..1b26327 100644
--- a/diadem/quantify/quant.py
+++ b/diadem/quantify/quant.py
@@ -186,7 +186,7 @@ def get_peaks(
         if (
             sum_intensity == 0
             or (break_if_up and sum_intensity > last_val + last_val / 1000)
-            or count > 3
+            or count >= 3
         ):
             return intensities, rts
 
@@ -219,7 +219,12 @@ def get_peaks(
             rts.append(rt)
             last_val = sum_intensity
             count = 0
-
+        elif last_val == sum_intensity:
+            count += 1
+            if count >= 3:
+                return intensities, rts
+            intensities.append(sum_intensity)
+            rts.append(rt)
         else:
             logger.info("Error: something weird happened")
             return
@@ -286,7 +291,8 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra
     # Add row numbers to the mass spec dataframe so I can easily go to the right
     # and left rts (this can be changed later to something more elegant)
     num_ms_data = ms_data.with_row_count()
-
+    matched_df.to_pandas()
+    num_ms_data.to_pandas()
     # Iterate through each row in the peptide-spectrum matched dataframe and
     # get the intensities for each peptide
     for row in matched_df.rows(named=True):
diff --git a/tests/data/small-ms_data.parquet b/tests/data/small-ms_data.parquet
index d341da1fd03059cc6c200a8dd937ea3832800d79..de7fa1fb4ea036cd73a9f86f61ddf35d1c0b9cca 100644
GIT binary patch
delta 1064
zcmcaDd|cEez%j^Bluh&ohv*IF7||9+Q3g>JFcuMHU=Y4n{eExm?QOaF!vE2M##a5m
z3=y_$3@)4#P5aFlA~O#Zi7+r&i!w+YOi+-XwUtdaOocCT%iP5~uO3MFGMCw3wt|Cs
zJx`Ogv*3o1gVDUF7#tWU|6`H#ZebMb;uPy*QR85Mxa12tE?LXQkitIMj?uE-7z8#)
zJItC11UI8EN1r_d1UxaqF=x(1?~Hax1A(?<ac;R&N^ABsnzX)cFKPMHAX36s{infL
zAz|_gD+7}Rsfdd=nBO<rf7sB-&TI4LKsOs-%7%+~o?SeYAssm_rErU(un@mlcT=xm
zkJ+t@Zq`6-d@BP;OJ}%A^SbMAWH&1?NV8SQe$dEkV<1=2e4K+%Z1P4%#d?T;m$9hb
zVgY%01(WD9#u#X*Gl+7CG5|xK{r`VvNC=xUF{m&xfIvf-14Cp(v_t%Xqz9=CAM6<%
zI0Qr_WE7NC)HECf85Forup2Q)I@nE+Ji+v-@yvmY1PcQx3Emzyw`j@2MrBbhNmT|m
zNeLh3qLN}!9x)ZM8%$!?7}aiZLS5m8+YwtC7!ts4Q0Q@B09wt=z`AT_Py(y_<SZs>
zXBDw)OkyQ!Pq?85x!^WvEdxUY*dT+6K!Z4#8Q9eJ1SJB^ImjgG2{xxlZ4oat9ef}s
z9n5882$?LxywDJwB7s2(Ns-z&gJ;ieIyS9ugYn{WkBGtrlKeKt6PnH0X2`v2o6Gw1
zoQTNehpdvmkjPmCj2u1&Nf|K?(JmG-po4+TTP&i>;A~FOYm89wC)}bXQ05|D(IS{?
zKG8)ildV~Hai~bjNam<bVwwDxMGBa~;Avg#n6RiAgIWWp3@BAtA*TwlJHnz83~C#=
zWnhtp8E0Z&z-n&r$bjNN1Q7>fJt81;r|`*u0#z6hsA7x25`TmyKVX%TLH4NF4iTVz
bVvCf;RMa@c`UEzMvh8E$n9j<;04g2;b6`)?

delta 872
zcmX>udRy2uz%j^Bluh&xv*;Y=7*QKh22ma$782zWWMJUDSN-0W^FIXeYi!m3%Mh`b
zmEjA=M3?@0euirShq{g^_G~LQ*)Ao~v#<EUN@o7V2U8sjk~Ns7X;rLtlw{Bl<&soo
zV3V|9&aEnD$;m7(VaY5>%@ySlvk_avB6fyZje`MV`wS*xY+uXFFol_cA<}_iauK76
zl`KO-f`NgA1P>1z8*_8x!2>8<YflECV1mJfuK_2TV3HN71|5P7I+Gok6zjomIKiYg
ziv{F}6w$C4u;)Pmz#_^33;<?(h^JW@8D=nmKtq@VLu5m=L;QiH2dNIM5bJjsy<sYX
zdQ^hBsH7O^Jsz<MjABiUYP;AWMq6MpS`r+fy$lQ<V6zo^92l4w!Z_AE;Fw&%CT#;Y
zB1P>3ID}F}v6+BsfWbtB0rS};t)K?X<AMh97NR0pmW_dB@*C!biaZR#8aFNpmZg2+
z<J9IedEA)JR`lT*!)52Y8#XK4pV`DTc^`|U4>$~;Fsq&5W{{K-;}AW=EC%!qkU5J*
z^aK->y^CG6i4n^Fz#*CfWzOS*Gf!}fK4G4$!n(_hM^Z*IN9_u;3=v63><hoB7=zj)
zHW^TSS|G)zSeJmP1cTZh4wx}mB1UWxSj`pA$@y$D!ieA&+a&-pwTXN3JT@s2L@0_~
r0E@5TmjU@)8p-Ek&p;-LZ4nUTQR5Ih#Iu=^eIGN&6jlZXQ1$`<9fI8v

diff --git a/tests/data/small-peptides.parquet b/tests/data/small-peptides.parquet
index e0906a15ec5d76539918f41b4d2c2cc602ddca00..02b7962db2e10df60bdfaeb35215e784426e8f9c 100644
GIT binary patch
delta 696
zcmZqUS<h1+;22~m$|mY28W97e7{HiCltGYzftg)ntNvex2u?-@E(QjM0M~#JPZw7p
zU<2}eT!Vc)U0H!FR~OF^4^bXTRR%Um3HE~2f|AUXR8bx=HnAeH7DhD=kdY>$Za7S@
zk2nZ40|XrU9l&(P1PBcj5#@rI${kRYnp|2`oL}S{1vWWBtVrz$Bgo`=jG{R)qC!xI
zbBMA59nQLqk@f$7W{B%e85tTF7$O}QG7l6bIpjYmeUR$F$>1zFql>}7K!Ssdjd|zV
z9tLxr2G+^@m?Ry9#Lh5^&0<vh#sqP<9S(O#ECq%L*xUn23?dH94BE|(Jq!tqlkFKL
z>)FH-#In?Gu#jLxB*F-|uRZhgeDjksb23XRp}x*iYhz=Olo8_)ZDAAxh8B={gi*8z
z%KXM8ItwH$3=+S=BANviZ(|e9n*5ZpDS}N>MlwdNhfxM!bct<X7Zqbr+r}gVcL<K)
z7dr-4Ie}&JUM49~q>vN4!!9bppmvB=1{54l@Zb>p0@m<>4Pi76KZ<p6039KAk4214
VjYI6oW_RZ8OdR4Yz%U0UR{-L@ii`jN

delta 619
zcmZ3_)5cRD;22~m$|h<e>J}r)BFX^6f(#7I%o<zu|1v}zWMJT8U|<Mv4G8gcaRmZ4
zAkW7&*vHdVlt)sPflX3^y&$!qBr_#dlt+w3EJ3VDje`Mbri!Q;E;Ac}Mu30=SHA<8
zkukwx@?IwKdKR$&u_Uz(j3E74qG2(jJWw~WiL!v)WXAlTAL6XH3=9Pf43Q2DnForJ
z9P%HOI<PS$C`j-;*=EFaX9I(nC>Pjm7R<R-#Vk3Q#U(76C8@bU7xIWrV-#y*RJ+84
z=|}}p1|*-uTnVxT1P&xQOipB!w1PNt4hyD!g4!LK#cd$k53w>x%7}4@7Ks5(2C_CV
ziY7ssmzYGG7@_PrEO6mNtfEPid6=5CStMm7W7Mi-@C3iu6E;y12DM3(cQZ*k!UI+8
z2b-uEgW45l8Dc_5tdCt(f<f&ciwwxuf^c7pEn^3nvShM7v!n#vGh%x{qGHdO#aPrh
U#Aa<SVBXHeA<6<wIKUJJ0GAwe-T(jq

diff --git a/tests/quant_test.py b/tests/quant_test.py
index 42d3f00..524f160 100644
--- a/tests/quant_test.py
+++ b/tests/quant_test.py
@@ -7,24 +7,34 @@
 
 def test_quant(get_data):
     """Test that the entire module returns the correct data."""
-    # NOTE: For some reason polars `assert_frame_equals` function is
-    # failing, but pandas works fine. I'll fix later.
-    ms_data, peptides = get_data
-    quant_results = q.quant(ms_data, peptides).to_pandas()
+    ms_data, pep = get_data
+    quant_results = q.quant(ms_data, pep)
     manual_results = pl.DataFrame(
         {
-            "peptide": ["PEPTIDEEEE", "LESLIE"],
+            "peptide": ["PEPTIDEEEE", "LESLIE", "EDITH"],
             "intensity": [
                 np.trapz([610.00, 100.00]),
                 np.trapz([1111.00, (2210.00 + 1560.00), 4000.00]),
+                np.trapz(
+                    [
+                        110.8,
+                        110.8,
+                        110.8,
+                        123.1,
+                        456.4,
+                        765.7,
+                        321.3,
+                        110.8,
+                        110.8,
+                        110.8,
+                    ]
+                ),
             ],
-            "mz": [100.01, np.average([249.99, 249.985])],
-            "num_fragments": [2.0, 3.0],
+            "mz": [100.01, np.average([249.99, 249.985]), 100.0],
+            "num_fragments": [2.0, 3.0, 10.0],
         }
-    ).to_pandas()
-    assert quant_results.equals(manual_results)
-    # assert_frame_equal(quant_results, manual_results)
-    # ^^^ NOT WORKING even when comparing the same dataframe
+    )
+    assert_frame_equal(manual_results, quant_results)
 
 
 def test_match_peptide(get_data):
@@ -33,11 +43,11 @@ def test_match_peptide(get_data):
     match_result = q.match_peptide(ms_data, peptides)
     manual_result = pl.DataFrame(
         {
-            "peptide": ["PEPTIDEEEE", "LESLIE"],
-            "rt": [100.0, 150.0],
-            "intensity": [610.00, (2210.00 + 1560.00)],
-            "mz": [100.01, np.average([249.99, 249.985])],
-            "ims": [99.982, np.average([149.98, 150.01])],
+            "peptide": ["PEPTIDEEEE", "LESLIE", "EDITH"],
+            "rt": [100.0, 150.0, 5.0],
+            "intensity": [610.00, (2210.00 + 1560.00), 123.1],
+            "mz": [100.01, np.average([249.99, 249.985]), 100.0],
+            "ims": [99.982, np.average([149.98, 150.01]), 100.0],
         }
     )
     assert_frame_equal(match_result, manual_result)

From 365130f37df59e51c0f881769d6a69958f216737 Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Thu, 4 May 2023 11:22:10 -0600
Subject: [PATCH 5/6] remove debugging statements

---
 diadem/quantify/quant.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py
index 1b26327..3b919fb 100644
--- a/diadem/quantify/quant.py
+++ b/diadem/quantify/quant.py
@@ -219,6 +219,9 @@ def get_peaks(
             rts.append(rt)
             last_val = sum_intensity
             count = 0
+
+        # If the intensities are the same, update the count and return if
+        # the count is 3 or more. Otherwise, append to intensities and rts.
         elif last_val == sum_intensity:
             count += 1
             if count >= 3:
@@ -291,8 +294,7 @@ def peptide_quant(ms_data: pl.DataFrame, matched_df: pl.DataFrame) -> pl.DataFra
     # Add row numbers to the mass spec dataframe so I can easily go to the right
     # and left rts (this can be changed later to something more elegant)
     num_ms_data = ms_data.with_row_count()
-    matched_df.to_pandas()
-    num_ms_data.to_pandas()
+
     # Iterate through each row in the peptide-spectrum matched dataframe and
     # get the intensities for each peptide
     for row in matched_df.rows(named=True):

From 4a77a720f221f4281307509228623773984d6fed Mon Sep 17 00:00:00 2001
From: Carolyn Allen <callen@talus.bio>
Date: Thu, 4 May 2023 11:34:08 -0600
Subject: [PATCH 6/6] check precursor information

---
 diadem/quantify/quant.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/diadem/quantify/quant.py b/diadem/quantify/quant.py
index 3b919fb..d527543 100644
--- a/diadem/quantify/quant.py
+++ b/diadem/quantify/quant.py
@@ -27,7 +27,7 @@ def quant(ms_data: pl.DataFrame, pep: pl.DataFrame()) -> None:
 
 
 def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame:
-    """Helper function to get the intensity of a peak at a given retention time.
+    """Map the peptide to a peak in the mass spec data.
 
     Parameters
     ----------
@@ -49,8 +49,13 @@ def match_peptide(ms_data: pl.DataFrame, pep: pl.DataFrame) -> pl.DataFrame:
     for row in ms_data.rows(named=True):
         # Convert the mzs to a series so they are easier to use
         row["mzs"] = pl.Series(row["mzs"])
+        # Make sure we are looking at the correct precursor window
+        peptides = pep.filter(
+            (pl.col("PrecursorMZ") >= row["precursor_start"])
+            & (pl.col("PrecursorMZ") <= row["precursor_end"])
+        )
         # Get all peptides at the given retention time
-        peptide = pep.filter(pl.col("RetentionTime") == row["rts"])
+        peptide = peptides.filter(pl.col("RetentionTime") == row["rts"])
         # For each peptide at the retention time, find the most intense peak and sum
         # across ion mobility
         for p in peptide.rows(named=True):