TalusBio · jspaezp · Dec 12, 2023 · Dec 12, 2023 · Dec 12, 2023 · Dec 12, 2023
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -27,14 +27,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install flake8 pytest pytest-cov wheel
-        pip install -e .
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        pip install -e ".[test,build]"
     - name: Run unit and system tests
       run: |
         pytest -v --cov=petasus tests/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,19 +1,19 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.4.0
+  rev: v4.5.0
   hooks:
   - id: check-toml
   - id: check-yaml
   - id: end-of-file-fixer
   - id: trailing-whitespace
   - id: detect-private-key
 - repo: https://github.com/psf/black
-  rev: 23.3.0
+  rev: 23.12.0
   hooks:
     - id: black
       language_version: python3.10
 - repo: https://github.com/charliermarsh/ruff-pre-commit
-  rev: v0.0.264
+  rev: v0.1.7
   hooks:
     - id: ruff
       args: ['--fix']
diff --git a/petasus/protein_ngram.py b/petasus/protein_ngram.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+from os import PathLike
+
+from loguru import logger
+from pyteomics.fasta import FASTA
+from tqdm.auto import tqdm
+
+FASTA_NAME_REGEX = re.compile(r"^.*\|(.*)\|.*$")
+UNIPROT_ACC_REGEX = re.compile(r"^[A-Z0-9]{6}(-\d+)?$")
+
+
+class ProteinNGram:
+    """Implements an n-gram to fast lookup of proteins that match a peptide.
+
+    Usage
+    -----
+    ```python
+    ngram = ProteinNGram.from_fasta("path/to/fasta")
+    ngram.search_ngram("AAC")
+    ['Prot1']
+    ```
+
+    Examples
+    --------
+    >>> base_ngram = {'AA': {1,3}, 'AB': {2,3}, 'AC': {1, 4}, 'CA': {4}}
+    >>> inv_index = {1: "Prot1", 2: "Prot2", 3: "Prot3", 4: "Prot4"}
+    >>> inv_seq = {1: "AACAA", 2: "AABAA", 3: "ABDAA", 4: "CACAA"}
+    >>> ngram = ProteinNGram(
+    ...     ngram = base_ngram,
+    ...     inv_alias = inv_index,
+    ...     inv_seq = inv_seq)
+    >>> ngram.search_ngram("AAC")
+    ['Prot1']
+    >>> ngram.search_ngram("CAC")
+    ['Prot4']
+    """
+
+    __slots__ = ("ngram_size", "ngram", "inv_alias", "inv_seq")
+
+    def __init__(
+        self,
+        ngram: dict[str, set[int]],
+        inv_alias: dict[int, str],
+        inv_seq: dict[int, str],
+    ) -> None:
+        """Initialized an ngram for fast lookup.
+
+        For details check the main class docstring.
+        """
+        keys = list(ngram)
+        if not all(len(keys[0]) == len(k) for k in ngram):
+            raise ValueError("All ngram keys need to be the same length")
+        self.ngram_size: int = len(keys[0])
+        self.ngram = ngram
+        self.inv_alias = inv_alias
+        self.inv_seq = inv_seq
+
+    def search_ngram(self, entry: str) -> list[str]:
+        """Searches a sequence using the n-gram and returns the matches."""
+
+        if len(entry) < self.ngram_size:
+            raise ValueError(
+                f"Entry {entry} is shorter than the n-gram size ({self.ngram_size})"
+            )
+
+        candidates = None
+        for x in [
+            entry[x : x + self.ngram_size]
+            for x in range(1 + len(entry) - self.ngram_size)
+        ]:
+            if len(x) < self.ngram_size:
+                raise
+
+            if candidates is None:
+                candidates = self.ngram.get(x, set())
+            else:
+                candidates = candidates.intersection(self.ngram[x])
+                if len(candidates) <= 1:
+                    break
+
+        # This makes sure the whole sequence is matched.
+        # For instance ... "BAAAAB" and "BAAAAAAAAAB" share all the same length 2
+        # ngrams, but do not match the same sequence (if the protein is "PEPTIDEBAAAB",
+        # only the first would be kept).
+        out = [
+            self.inv_alias[x] for x in candidates if entry in self.inv_seq[x]
+        ]
+        return out
+
+    @staticmethod
+    def from_fasta(
+        fasta_file: PathLike | str,
+        ngram_size: int = 4,
+        progress: bool = True,
+        proteins_keep: None | set[str] = None,
+    ) -> ProteinNGram:
+        """Builds a protein n-gram from a fasta file.
+
+        Parameters
+        ----------
+        fasta_file:
+            Path-like or string representing the fasta file to read in order
+            to build the index.
+        ngram_size:
+            Size of the chunks that will be used to build the n-gram, should
+            be smaller than the smallest peptide to be searched. Longer sequences
+            should give a more unique aspect to it but a larger index is built.
+        progress:
+            Whether to show a progress bar while building the index.
+        proteins_keep: set[str]:
+            If not None, only keep the proteins in the set, by matching the
+            uniprot ID. Example: {'Q92804', 'Q92804-2', 'P13639'}
+
+        """
+        ngram = defaultdict(set)
+        inv_alias = {}
+        inv_seq = {}
+        skipped = 0
+        kept = 0
+
+        for i, entry in tqdm(
+            enumerate(FASTA(str(fasta_file))),
+            disable=not progress,
+            desc="Building peptide n-gram index",
+        ):
+            entry_name = FASTA_NAME_REGEX.search(entry.description).group(1)
+            if proteins_keep is not None and entry_name not in proteins_keep:
+                skipped += 1
+                continue
+            else:
+                kept += 1
+            sequence = entry.sequence
+            if len(sequence) < ngram_size:
+                logger.warning(
+                    f"Skipping {entry_name} because it is shorter than the n-gram size"
+                )
+                continue
+
+            inv_alias[i] = entry_name
+            inv_seq[i] = sequence
+            for x in [
+                sequence[x : x + ngram_size]
+                for x in range(1 + len(sequence) - ngram_size)
+            ]:
+                ngram[x].add(i)
+
+        if proteins_keep is not None:
+            logger.info(
+                f"Kept {kept} proteins and skipped {skipped} "
+                "proteins when importing the fasta file",
+            )
+
+        return ProteinNGram(ngram=ngram, inv_alias=inv_alias, inv_seq=inv_seq)
+
+
+def get_protein_accessions(protein_list: list[str]) -> set[str]:
+    """Extracts all protein accessions from a list of protein groups.
+
+    This is meant to be used on the protein groups column in a mokapot
+    results file.
+
+    Examples
+    --------
+        >>> tmp = [
+        ...     "sp|Q92804|RBP56_HUMAN",
+        ...     "sp|Q92804-2|RBP56_HUMAN",
+        ...     "sp|P13639|EF2_HUMAN"]
+        >>> got = get_protein_accessions(tmp)
+        >>> exp = {'Q92804', 'Q92804-2', 'P13639'}
+        >>> exp == got
+        True
+    """
+    out = set()
+    for protein in protein_list:
+        for accession in protein.split(","):
+            acc = accession.split("|")[1]
+            out.add(acc)
+    return out
diff --git a/petasus/scripts/add_peptidetoprotein.py b/petasus/scripts/add_peptidetoprotein.py
@@ -0,0 +1,107 @@
+import click
+import sqlite3
+import os
+import pandas as pd
+from petasus.protein_ngram import ProteinNGram, get_protein_accessions
+
+
+def _add_peptidetoprotein(
+    mokapot_proteins: os.PathLike,
+    fasta_file: os.PathLike,
+    speclib: os.PathLike,
+    ngram_size=4,
+    q_threshold=0.1,
+):
+    """Add peptidetoprotein table to dlib/elib file."""
+    mokapot_proteins = pd.read_csv(mokapot_proteins, sep="\t")
+    mokapot_proteins = mokapot_proteins[
+        mokapot_proteins["mokapot q-value"] < q_threshold
+    ]
+
+    accessions_keep = get_protein_accessions(
+        mokapot_proteins["mokapot protein group"].tolist()
+    )
+
+    ngram = ProteinNGram.from_fasta(
+        fasta_file,
+        proteins_keep=accessions_keep,
+        ngram_size=ngram_size,
+    )
+
+    conn = sqlite3.connect(speclib)
+    df = pd.read_sql("SELECT PeptideSeq FROM entries", conn)
+    conn.close()
+    peps = df["PeptideSeq"].unique().tolist()
+    del df
+
+    matches = [ngram.search_ngram(p) for p in peps]
+    num_marches = sum([len(m) for m in matches])
+
+    peptide_col = [None] * num_marches
+    protein_col = [None] * num_marches
+
+    i = 0
+    for p, m in zip(peps, matches):
+        for n in m:
+            peptide_col[i] = p
+            protein_col[i] = n
+            i += 1
+
+    df = pd.DataFrame(
+        {"PeptideSeq": peptide_col, "ProteinAccession": protein_col}
+    )
+    df["isDecoy"] = False
+    conn = sqlite3.connect(speclib)
+    conn.execute("DROP TABLE IF EXISTS peptidetoprotein")
+    conn.execute(
+        (
+            "CREATE TABLE peptidetoprotein "
+            "(PeptideSeq string not null, "
+            "isDecoy boolean, "
+            "ProteinAccession string not null)"
+        )
+    )
+    df.to_sql("peptidetoprotein", conn, if_exists="append", index=False)
+    conn.close()
+
+
+@click.command()
+@click.argument("mokapot_proteins", type=click.Path(exists=True))
+@click.argument("fasta_file", type=click.Path(exists=True))
+@click.argument("speclib", type=click.Path(exists=True))
+@click.option(
+    "--ngram_size",
+    type=int,
+    default=4,
+    help=(
+        "Size of the chunks that will be used to build the n-gram,"
+        " should be smaller than the smallest peptide to be searched."
+        " Longer sequences should give a more unique aspect to it"
+        " but a larger index is built.",
+    ),
+)
+@click.option(
+    "--q_threshold",
+    type=float,
+    default=0.1,
+    help="Threshold for mokapot q-value.",
+)
+def add_peptidetoprotein(
+    mokapot_proteins,
+    fasta_file,
+    speclib,
+    ngram_size=4,
+    q_threshold=0.1,
+):
+    """Add peptidetoprotein table to dlib/elib file."""
+    _add_peptidetoprotein(
+        mokapot_proteins=mokapot_proteins,
+        fasta_file=fasta_file,
+        speclib=speclib,
+        ngram_size=ngram_size,
+        q_threshold=q_threshold,
+    )
+
+
+if __name__ == "__main__":
+    add_peptidetoprotein()
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,16 +19,22 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "click",
+    "click", # General
     "loguru",
-    "polars",
-    "pyteomics",
     "lxml",
+    "appdirs",
+    "polars", # Data science
+    "pyarrow",
+    "pandas",
     "numpy",
     "numba",
-    "pyarrow",
-    "hdf5plugin>=3.10.0",
-    "ms2ml @ git+https://github.com/wfondrie/ms2ml.git@encyclopedia-update"
+    "lark",
+    "hdf5plugin>=3.10.0", # Proteomics
+    "ms2ml>=0.0.45",
+    "pyteomics",
+    "psims",
+    "mokapot",
+    "spectrum_utils>=0.4.2",
 ]
 dynamic = ["version"]
 
@@ -42,6 +48,16 @@ Homepage = "https://github.com/TalusBio/search2dlib"
 [project.optional-dependencies]
 dev = [
     "pre-commit>=2.7.1",
+    "black",
+    "ruff",
+]
+test = [
+    "pytest",
+    "pytest-cov",
+    "pytest-datadir",
+]
+build = [
+    "wheel",
 ]
 
 [project.scripts]
@@ -65,22 +81,4 @@ select = ["E", "F", "T20"]  # T20 is for print() statements.
 line-length = 79
 target-version = ['py310']
 include = '\.pyi?$'
-exclude = '''
-
-(
-  /(
-      \.eggs         # exclude a few common directories in the
-    | \.git          # root of the project
-    | \.hg
-    | \.mypy_cache
-    | \.tox
-    | \.venv
-    | _build
-    | buck-out
-    | build
-    | dist
-  )/
-  | foo.py           # also separately exclude a file named foo.py in
-                     # the root of the project
-)
-'''
+# Black excludes files in the .gitignore by default