diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7bcb904..67d6343 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,14 +27,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install flake8 pytest pytest-cov wheel
-          pip install -e .
-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          pip install -e ".[test,build]"
       - name: Run unit and system tests
         run: |
           pytest -v --cov=petasus tests/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e23b479..17acc81 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
       - id: check-toml
       - id: check-yaml
@@ -8,12 +8,12 @@ repos:
       - id: trailing-whitespace
       - id: detect-private-key
   - repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 23.12.0
     hooks:
       - id: black
         language_version: python3.10
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.264
+    rev: v0.1.7
     hooks:
       - id: ruff
         args: ['--fix']
+ """ + keys = list(ngram) + if not all(len(keys[0]) == len(k) for k in ngram): + raise ValueError("All ngram keys need to be the same length") + self.ngram_size: int = len(keys[0]) + self.ngram = ngram + self.inv_alias = inv_alias + self.inv_seq = inv_seq + + def search_ngram(self, entry: str) -> list[str]: + """Searches a sequence using the n-gram and returns the matches.""" + + if len(entry) < self.ngram_size: + raise ValueError( + f"Entry {entry} is shorter than the n-gram size ({self.ngram_size})" + ) + + candidates = None + for x in [ + entry[x : x + self.ngram_size] + for x in range(1 + len(entry) - self.ngram_size) + ]: + if len(x) < self.ngram_size: + raise + + if candidates is None: + candidates = self.ngram.get(x, set()) + else: + candidates = candidates.intersection(self.ngram[x]) + if len(candidates) <= 1: + break + + # This makes sure the whole sequence is matched. + # For instance ... "BAAAAB" and "BAAAAAAAAAB" share all the same length 2 + # ngrams, but do not match the same sequence (if the protein is "PEPTIDEBAAAB", + # only the first would be kept). + out = [ + self.inv_alias[x] for x in candidates if entry in self.inv_seq[x] + ] + return out + + @staticmethod + def from_fasta( + fasta_file: PathLike | str, + ngram_size: int = 4, + progress: bool = True, + proteins_keep: None | set[str] = None, + ) -> ProteinNGram: + """Builds a protein n-gram from a fasta file. + + Parameters + ---------- + fasta_file: + Path-like or string representing the fasta file to read in order + to build the index. + ngram_size: + Size of the chunks that will be used to build the n-gram, should + be smaller than the smallest peptide to be searched. Longer sequences + should give a more unique aspect to it but a larger index is built. + progress: + Whether to show a progress bar while building the index. + proteins_keep: set[str]: + If not None, only keep the proteins in the set, by matching the + uniprot ID. Example: {'Q92804', 'Q92804-2', 'P13639'} + + """ + ngram = defaultdict(set) + inv_alias = {} + inv_seq = {} + skipped = 0 + kept = 0 + + for i, entry in tqdm( + enumerate(FASTA(str(fasta_file))), + disable=not progress, + desc="Building peptide n-gram index", + ): + entry_name = FASTA_NAME_REGEX.search(entry.description).group(1) + if proteins_keep is not None and entry_name not in proteins_keep: + skipped += 1 + continue + else: + kept += 1 + sequence = entry.sequence + if len(sequence) < ngram_size: + logger.warning( + f"Skipping {entry_name} because it is shorter than the n-gram size" + ) + continue + + inv_alias[i] = entry_name + inv_seq[i] = sequence + for x in [ + sequence[x : x + ngram_size] + for x in range(1 + len(sequence) - ngram_size) + ]: + ngram[x].add(i) + + if proteins_keep is not None: + logger.info( + f"Kept {kept} proteins and skipped {skipped} " + "proteins when importing the fasta file", + ) + + return ProteinNGram(ngram=ngram, inv_alias=inv_alias, inv_seq=inv_seq) + + +def get_protein_accessions(protein_list: list[str]) -> set[str]: + """Extracts all protein accessions from a list of protein groups. + + This is meant to be used on the protein groups column in a mokapot + results file. + + Examples + -------- + >>> tmp = [ + ... "sp|Q92804|RBP56_HUMAN", + ... "sp|Q92804-2|RBP56_HUMAN", + ... 
"sp|P13639|EF2_HUMAN"] + >>> got = get_protein_accessions(tmp) + >>> exp = {'Q92804', 'Q92804-2', 'P13639'} + >>> exp == got + True + """ + out = set() + for protein in protein_list: + for accession in protein.split(","): + acc = accession.split("|")[1] + out.add(acc) + return out diff --git a/petasus/scripts/add_peptidetoprotein.py b/petasus/scripts/add_peptidetoprotein.py new file mode 100644 index 0000000..cea2ccb --- /dev/null +++ b/petasus/scripts/add_peptidetoprotein.py @@ -0,0 +1,107 @@ +import click +import sqlite3 +import os +import pandas as pd +from petasus.protein_ngram import ProteinNGram, get_protein_accessions + + +def _add_peptidetoprotein( + mokapot_proteins: os.PathLike, + fasta_file: os.PathLike, + speclib: os.PathLike, + ngram_size=4, + q_threshold=0.1, +): + """Add peptidetoprotein table to dlib/elib file.""" + mokapot_proteins = pd.read_csv(mokapot_proteins, sep="\t") + mokapot_proteins = mokapot_proteins[ + mokapot_proteins["mokapot q-value"] < q_threshold + ] + + accessions_keep = get_protein_accessions( + mokapot_proteins["mokapot protein group"].tolist() + ) + + ngram = ProteinNGram.from_fasta( + fasta_file, + proteins_keep=accessions_keep, + ngram_size=ngram_size, + ) + + conn = sqlite3.connect(speclib) + df = pd.read_sql("SELECT PeptideSeq FROM entries", conn) + conn.close() + peps = df["PeptideSeq"].unique().tolist() + del df + + matches = [ngram.search_ngram(p) for p in peps] + num_marches = sum([len(m) for m in matches]) + + peptide_col = [None] * num_marches + protein_col = [None] * num_marches + + i = 0 + for p, m in zip(peps, matches): + for n in m: + peptide_col[i] = p + protein_col[i] = n + i += 1 + + df = pd.DataFrame( + {"PeptideSeq": peptide_col, "ProteinAccession": protein_col} + ) + df["isDecoy"] = False + conn = sqlite3.connect(speclib) + conn.execute("DROP TABLE IF EXISTS peptidetoprotein") + conn.execute( + ( + "CREATE TABLE peptidetoprotein " + "(PeptideSeq string not null, " + "isDecoy boolean, " + "ProteinAccession string not null)" + ) + ) + df.to_sql("peptidetoprotein", conn, if_exists="append", index=False) + conn.close() + + +@click.command() +@click.argument("mokapot_proteins", type=click.Path(exists=True)) +@click.argument("fasta_file", type=click.Path(exists=True)) +@click.argument("speclib", type=click.Path(exists=True)) +@click.option( + "--ngram_size", + type=int, + default=4, + help=( + "Size of the chunks that will be used to build the n-gram," + " should be smaller than the smallest peptide to be searched." 
+ " Longer sequences should give a more unique aspect to it" + " but a larger index is built.", + ), +) +@click.option( + "--q_threshold", + type=float, + default=0.1, + help="Threshold for mokapot q-value.", +) +def add_peptidetoprotein( + mokapot_proteins, + fasta_file, + speclib, + ngram_size=4, + q_threshold=0.1, +): + """Add peptidetoprotein table to dlib/elib file.""" + _add_peptidetoprotein( + mokapot_proteins=mokapot_proteins, + fasta_file=fasta_file, + speclib=speclib, + ngram_size=ngram_size, + q_threshold=q_threshold, + ) + + +if __name__ == "__main__": + add_peptidetoprotein() diff --git a/pyproject.toml b/pyproject.toml index 640faa2..3288b3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,16 +19,22 @@ classifiers = [ ] requires-python = ">=3.10" dependencies = [ - "click", + "click", # General "loguru", - "polars", - "pyteomics", "lxml", + "appdirs", + "polars", # Data science + "pyarrow", + "pandas", "numpy", "numba", - "pyarrow", - "hdf5plugin>=3.10.0", - "ms2ml @ git+https://github.com/wfondrie/ms2ml.git@encyclopedia-update" + "lark", + "hdf5plugin>=3.10.0", # Proteomics + "ms2ml>=0.0.45", + "pyteomics", + "psims", + "mokapot", + "spectrum_utils>=0.4.2", ] dynamic = ["version"] @@ -42,6 +48,16 @@ Homepage = "https://github.com/TalusBio/search2dlib" [project.optional-dependencies] dev = [ "pre-commit>=2.7.1", + "black", + "ruff", +] +test = [ + "pytest", + "pytest-cov", + "pytest-datadir", +] +build = [ + "wheel", ] [project.scripts] @@ -65,22 +81,4 @@ select = ["E", "F", "T20"] # T20 is for print() statements. line-length = 79 target-version = ['py310'] include = '\.pyi?$' -exclude = ''' - -( - /( - \.eggs # exclude a few common directories in the - | \.git # root of the project - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | _build - | buck-out - | build - | dist - )/ - | foo.py # also separately exclude a file named foo.py in - # the root of the project -) -''' +# Black excludes files in the .gitignore by default diff --git a/tests/unit_tests/data/Q99536.fasta b/tests/unit_tests/data/Q99536.fasta new file mode 100644 index 0000000..b1020d3 --- /dev/null +++ b/tests/unit_tests/data/Q99536.fasta @@ -0,0 +1,11 @@ +>sp|Q99536|VAT1_HUMAN Synaptic vesicle membrane protein VAT-1 homolog OS=Homo sapiens OX=9606 GN=VAT1 PE=1 SV=2 +MSDEREVAEAATGEDASSPPPKTEAASDPQHPAASEGAAAAAASPPLLRCLVLTGFGGYD +KVKLQSRPAAPPAPGPGQLTLRLRACGLNFADLMARQGLYDRLPPLPVTPGMEGAGVVIA +VGEGVSDRKAGDRVMVLNRSGMWQEEVTVPSVQTFLIPEAMTFEEAAALLVNYITAYMVL +FDFGNLQPGHSVLVHMAAGGVGMAAVQLCRTVENVTVFGTASASKHEALKENGVTHPIDY +HTTDYVDEIKKISPKGVDIVMDPLGGSDTAKGYNLLKPMGKVVTYGMANLLTGPKRNLMA +LARTWWNQFSVTALQLLQANRAVCGFHLGYLDGEVELVSGVVARLLALYNQGHIKPHIDS +VWPFEKVADAMKQMQEKKNVGKVLLVPGPEKEN +>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens OX=9606 GN=INS PE=1 SV=1 +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAED +LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN diff --git a/tests/unit_tests/test_protein_ngram.py b/tests/unit_tests/test_protein_ngram.py new file mode 100644 index 0000000..8591c76 --- /dev/null +++ b/tests/unit_tests/test_protein_ngram.py @@ -0,0 +1,32 @@ +from petasus.protein_ngram import ProteinNGram, get_protein_accessions +import pytest + + +def test_ngram(shared_datadir): + fasta_file = shared_datadir / "Q99536.fasta" + + ngram = ProteinNGram.from_fasta(str(fasta_file), 4) + assert ngram.ngram_size == 4 + assert len(ngram.search_ngram("PEPTIDE")) == 0 + assert len(ngram.search_ngram("LNRSGMWQEEVTVP")) == 1 + assert len(ngram.search_ngram("QEEVTVPLNRSGMW")) == 0 + + res = 
ngram.search_ngram("LNRSGMWQEEVTVP") + assert res[0] == "Q99536" + + res = ngram.search_ngram("VEALYLVCGERG") + assert res[0] == "P01308" + + with pytest.raises(ValueError): + res = ngram.search_ngram("VEA") + + +def test_id_extractor(): + tmp = [ + "sp|Q92804|RBP56_HUMAN", + "sp|Q92804-2|RBP56_HUMAN", + "sp|P13639|EF2_HUMAN", + ] + got = get_protein_accessions(tmp) + exp = {"Q92804", "Q92804-2", "P13639"} + assert exp == got