From c739b51d0f59e52594cb345b2120a7d1d22460bf Mon Sep 17 00:00:00 2001 From: Auke Heerdink <21688542+aukeheerdink@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:52:24 +0200 Subject: [PATCH 1/4] expanded proteases --- rustyms/src/align/mass_alignment.rs | 2 +- rustyms/src/aminoacid/aminoacid.rs | 29 ++ rustyms/src/peptidoform/peptidoform.rs | 7 +- rustyms/src/protease.rs | 377 +++++++++++++++++++++++-- 4 files changed, 392 insertions(+), 23 deletions(-) diff --git a/rustyms/src/align/mass_alignment.rs b/rustyms/src/align/mass_alignment.rs index 05c84c01..44ea5f45 100644 --- a/rustyms/src/align/mass_alignment.rs +++ b/rustyms/src/align/mass_alignment.rs @@ -299,7 +299,7 @@ fn calculate_masses( .iter() .map(|p| { p.formulas_all( - &[sequence], + &[], &[], &mut Vec::new(), false, diff --git a/rustyms/src/aminoacid/aminoacid.rs b/rustyms/src/aminoacid/aminoacid.rs index fbadb7fe..5fb4e88d 100644 --- a/rustyms/src/aminoacid/aminoacid.rs +++ b/rustyms/src/aminoacid/aminoacid.rs @@ -369,6 +369,35 @@ impl AminoAcid { Self::Valine, ]; + /// All amino acids (including I/L/J/B/Z but excluding X) + pub const ALL_AMINO_ACIDS: &'static [Self] = &[ + Self::Alanine, + Self::AmbiguousAsparagine, + Self::AmbiguousGlutamine, + Self::AmbiguousLeucine, + Self::Arginine, + Self::Asparagine, + Self::AsparticAcid, + Self::Cysteine, + Self::GlutamicAcid, + Self::Glutamine, + Self::Glycine, + Self::Histidine, + Self::Isoleucine, + Self::Leucine, + Self::Lysine, + Self::Methionine, + Self::Phenylalanine, + Self::Proline, + Self::Pyrrolysine, + Self::Selenocysteine, + Self::Serine, + Self::Threonine, + Self::Tryptophan, + Self::Tyrosine, + Self::Valine, + ]; + // TODO: generalise over used storage type, so using molecularformula, monoisotopic mass, or average mass, also make sure that AAs can return these numbers in a const fashion #[expect(clippy::too_many_lines, clippy::too_many_arguments)] pub(crate) fn fragments( diff --git a/rustyms/src/peptidoform/peptidoform.rs b/rustyms/src/peptidoform/peptidoform.rs index c23f0460..58dd6c21 100644 --- a/rustyms/src/peptidoform/peptidoform.rs +++ b/rustyms/src/peptidoform/peptidoform.rs @@ -431,10 +431,7 @@ impl Peptidoform { if let Modification::Ambiguous { .. } = f { acc } else { - let attachment = all_peptides[peptidoform_index] - .sequence - .first() - .map(|s| s.aminoacid.aminoacid()); + let attachment = self.sequence.first().map(|s| s.aminoacid.aminoacid()); let (formula, specific, _seen) = f.formula_inner( all_peptides, visited_peptides, @@ -1522,7 +1519,7 @@ impl> Peptidoform { let mut result = Vec::new(); for (index, start) in sites.iter().enumerate() { - for end in sites.iter().skip(index).take(max_missed_cleavages + 1) { + for end in sites.iter().skip(index + 1).take(max_missed_cleavages + 1) { result.push(self.sub_peptide((*start)..*end)); } } diff --git a/rustyms/src/protease.rs b/rustyms/src/protease.rs index 255fd3d9..14f22102 100644 --- a/rustyms/src/protease.rs +++ b/rustyms/src/protease.rs @@ -1,4 +1,7 @@ +use std::sync::LazyLock; + use itertools::Itertools; +use serde::{Deserialize, Serialize}; use crate::{AminoAcid, SequenceElement}; @@ -6,50 +9,67 @@ use crate::{AminoAcid, SequenceElement}; /// Each position is identified by an option, a none means that there is no specificity at this position. If there is /// a specificity at a certain position any amino acid that is contained in the set is allowed (see /// [`crate::CheckedAminoAcid::canonical_identical`]). +/// +/// #TODO: Add the possibility to define the effiency of amino acids or certain motifs +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Protease { /// The amino acids n terminal of the cut site. - pub n_term: Vec>>, + pub before: Vec>>, /// The amino acids c terminal of the cut site. - pub c_term: Vec>>, + pub after: Vec>>, } impl Protease { /// Define a simple protease that cuts exactly between the specified sequences. - pub fn new(n_term: &[AminoAcid], c_term: &[AminoAcid]) -> Self { + pub fn new(n_term: Vec, c_term: Vec) -> Self { Self { - n_term: n_term.iter().map(|aa| Some(vec![*aa])).collect_vec(), - c_term: c_term.iter().map(|aa| Some(vec![*aa])).collect_vec(), + before: vec![Some(n_term)], + after: vec![Some(c_term)], } } - /// Define a protease that cuts on the n terminal side of the provided amino acids. - pub fn n_terminal_of(residues: &[AminoAcid]) -> Self { + /// Define a protease that cuts on the c terminal side of the provided amino acids. + pub fn c_terminal_of(residues: Vec) -> Self { Self { - n_term: vec![Some(residues.to_vec())], - c_term: Vec::new(), + before: vec![Some(residues)], + after: Vec::new(), } } - /// Define a protease that cuts on the c terminal side of the provided amino acids. - pub fn c_terminal_of(residues: &[AminoAcid]) -> Self { + /// Define a protease that cuts on the n terminal side of the provided amino acids. + pub fn n_terminal_of(residues: Vec) -> Self { Self { - c_term: vec![Some(residues.to_vec())], - n_term: Vec::new(), + before: Vec::new(), + after: vec![Some(residues)], } } + /// Helper function to get a list of all amino acids except the ones given + pub fn get_exclusive(exclude: &[AminoAcid]) -> Vec { + AminoAcid::ALL_AMINO_ACIDS + .iter() + .copied() + .filter(|aa| !exclude.contains(aa)) + .collect_vec() + } + /// All locations in the given sequence where this protease could cut + /// Note if a cutsite is "after" the last element in the sequence, it will not rapport that, only the cutsites inside the sequence pub fn match_locations(&self, sequence: &[SequenceElement]) -> Vec { - (self.n_term.len()..sequence.len() - self.c_term.len()) - .filter(|i| self.matches_at(&sequence[i - self.n_term.len()..i + self.c_term.len()])) + let upper = sequence + .len() + .saturating_sub(self.after.len()) + .min(sequence.len().saturating_sub(1)); + (self.before.len()..=upper) + .filter(|i| self.matches_at(&sequence[i - self.before.len()..i + self.after.len()])) .collect_vec() } fn matches_at(&self, slice: &[SequenceElement]) -> bool { - debug_assert!(slice.len() == self.n_term.len() + self.c_term.len()); + debug_assert!(slice.len() == self.before.len() + self.after.len()); 'positions: for (actual, pattern) in slice .iter() - .zip(self.n_term.iter().chain(self.c_term.iter())) + .zip(self.before.iter().chain(self.after.iter())) { if let Some(pattern) = pattern { for option in pattern { @@ -63,3 +83,326 @@ impl Protease { true } } + +/// Some well known and widely used proteases +pub mod known_proteases { + use super::*; + + /// `Trypsin` cuts after Lysine (K) or Arginine (R), unless followed by Proline (P) + pub static TRYPSIN: LazyLock = LazyLock::new(|| { + Protease::new( + vec![AminoAcid::Lysine, AminoAcid::Arginine], + Protease::get_exclusive(&[AminoAcid::Proline]), + ) + }); + + /// `Chymotrypsin` cuts after Phenylalanine (F), Tryptophan (W), Tyrosine (Y), unless followed by Proline (P) + pub static CHYMOTRYPSIN: LazyLock = LazyLock::new(|| { + Protease::new( + vec![ + AminoAcid::Phenylalanine, + AminoAcid::Tryptophan, + AminoAcid::Tyrosine, + ], + Protease::get_exclusive(&[AminoAcid::Proline]), + ) + }); + + /// `Pepsin` (pH > 2) cuts after Phenylalanine (F), Tryptophan (W), Tyrosine (Y), Leucine (L) + pub static PEPSIN: LazyLock = LazyLock::new(|| { + Protease::c_terminal_of(vec![ + AminoAcid::Phenylalanine, + AminoAcid::Tryptophan, + AminoAcid::Tyrosine, + AminoAcid::Leucine, + ]) + }); + + /// Elastase cuts after Alanine (A), Glycine (G), Serine (S), Valine (V), Isoleucine (I), Leucine (L) + pub static ELASTASE: LazyLock = LazyLock::new(|| { + Protease::c_terminal_of(vec![ + AminoAcid::Alanine, + AminoAcid::Glycine, + AminoAcid::Serine, + AminoAcid::Valine, + AminoAcid::Isoleucine, + AminoAcid::Leucine, + ]) + }); + + /// `AspN` cuts before Aspartic acid (D) + pub static ASPN: LazyLock = + LazyLock::new(|| Protease::n_terminal_of(vec![AminoAcid::AsparticAcid])); + + /// `GluC` cuts after Glutamic acid (E) + pub static GLUC: LazyLock = + LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::GlutamicAcid])); + + /// `LysC` cuts after Lysine (K) + pub static LYSC: LazyLock = + LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::Lysine])); + + /// `ArgC` cuts after Arginine (R) + pub static ARGC: LazyLock = + LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::Arginine])); +} + +#[cfg(test)] +#[allow(clippy::missing_panics_doc)] +mod tests { + use crate::{Linear, Peptidoform}; + + use super::*; + + pub struct ProteaseTestCase { + pub sequence: Peptidoform, + pub expected_cut_sites: Vec, + pub expected_peptides: Vec>, + } + + /// Generic test function for all proteases + pub fn test_protease(protease: &Protease, test_case: &ProteaseTestCase) { + // Test cut sites + let cut_sites = protease.match_locations(test_case.sequence.sequence()); + + assert_eq!( + cut_sites, test_case.expected_cut_sites, + "Incorrect cut sites: found '{cut_sites:?}' expected '{:?}'", + test_case.expected_cut_sites + ); + + // Test peptides + let peptides = test_case.sequence.digest(protease, 0); + + if peptides.len() != test_case.expected_peptides.len() { + for peptide in &peptides { + println!("{peptide}"); + } + panic!("Incorrect number of peptides") + } + + for (i, peptide) in peptides.iter().enumerate() { + assert_eq!( + peptide, &test_case.expected_peptides[i], + "Peptides don't match: found '{peptide}' expected '{}'", + test_case.expected_peptides[i] + ); + } + } + + fn str_to_peptidoform(str_peptide: &str) -> Peptidoform { + Peptidoform::pro_forma(str_peptide, None) + .unwrap() + .into_linear() + .unwrap() + } + + #[test] + fn trypsin() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("AKRPGKR"), + expected_cut_sites: vec![2, 6], + expected_peptides: vec![ + str_to_peptidoform("AK"), + str_to_peptidoform("RPGK"), + str_to_peptidoform("R"), + ], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("ARAKGCVLRPKDGR"), + expected_cut_sites: vec![2, 4, 11], + expected_peptides: vec![ + str_to_peptidoform("AR"), + str_to_peptidoform("AK"), + str_to_peptidoform("GCVLRPK"), + str_to_peptidoform("DGR"), + ], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::TRYPSIN, &test_case); + } + } + + #[test] + fn chymotrypsin() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("AFWYPLGF"), + expected_cut_sites: vec![2, 3], + expected_peptides: vec![ + str_to_peptidoform("AF"), + str_to_peptidoform("W"), + str_to_peptidoform("YPLGF"), + ], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("AVFUDGWTYPMSR"), + expected_cut_sites: vec![3, 7], + expected_peptides: vec![ + str_to_peptidoform("AVF"), + str_to_peptidoform("UDGW"), + str_to_peptidoform("TYPMSR"), + ], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::CHYMOTRYPSIN, &test_case); + } + } + + #[test] + fn pepsin() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("AACVFLPAKLURF"), + expected_cut_sites: vec![5, 6, 10], + expected_peptides: vec![ + str_to_peptidoform("AACVF"), + str_to_peptidoform("L"), + str_to_peptidoform("PAKL"), + str_to_peptidoform("URF"), + ], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("GFLPKDLVMSRG"), + expected_cut_sites: vec![2, 3, 7], + expected_peptides: vec![ + str_to_peptidoform("GF"), + str_to_peptidoform("L"), + str_to_peptidoform("PKDL"), + str_to_peptidoform("VMSRG"), + ], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::PEPSIN, &test_case); + } + } + + #[test] + fn elastase() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("FASGVPRKT"), + expected_cut_sites: vec![2, 3, 4, 5], + expected_peptides: vec![ + str_to_peptidoform("FA"), + str_to_peptidoform("S"), + str_to_peptidoform("G"), + str_to_peptidoform("V"), + str_to_peptidoform("PRKT"), + ], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("PGAVSLFTGK"), + expected_cut_sites: vec![2, 3, 4, 5, 6, 9], + expected_peptides: vec![ + str_to_peptidoform("PG"), + str_to_peptidoform("A"), + str_to_peptidoform("V"), + str_to_peptidoform("S"), + str_to_peptidoform("L"), + str_to_peptidoform("FTG"), + str_to_peptidoform("K"), + ], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::ELASTASE, &test_case); + } + } + + #[test] + fn aspn() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("FARDKPGLFD"), + expected_cut_sites: vec![3, 9], + expected_peptides: vec![ + str_to_peptidoform("FAR"), + str_to_peptidoform("DKPGLF"), + str_to_peptidoform("D"), + ], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("PFKDLTMSR"), + expected_cut_sites: vec![3], + expected_peptides: vec![str_to_peptidoform("PFK"), str_to_peptidoform("DLTMSR")], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::ASPN, &test_case); + } + } + + #[test] + fn gluc() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("FAREDKPGLF"), + expected_cut_sites: vec![4], + expected_peptides: vec![str_to_peptidoform("FARE"), str_to_peptidoform("DKPGLF")], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("PFKELGTMSR"), + expected_cut_sites: vec![4], + expected_peptides: vec![str_to_peptidoform("PFKE"), str_to_peptidoform("LGTMSR")], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::GLUC, &test_case); + } + } + + #[test] + fn lysc() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("FARKDPGLF"), + expected_cut_sites: vec![4], + expected_peptides: vec![str_to_peptidoform("FARK"), str_to_peptidoform("DPGLF")], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("PFKDLTKMSR"), + expected_cut_sites: vec![3, 7], + expected_peptides: vec![ + str_to_peptidoform("PFK"), + str_to_peptidoform("DLTK"), + str_to_peptidoform("MSR"), + ], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::LYSC, &test_case); + } + } + + #[test] + fn argc() { + let test_cases = vec![ + ProteaseTestCase { + sequence: str_to_peptidoform("FARKDPGLF"), + expected_cut_sites: vec![3], + expected_peptides: vec![str_to_peptidoform("FAR"), str_to_peptidoform("KDPGLF")], + }, + ProteaseTestCase { + sequence: str_to_peptidoform("PFKDLRTMSR"), + expected_cut_sites: vec![6], + expected_peptides: vec![str_to_peptidoform("PFKDLR"), str_to_peptidoform("TMSR")], + }, + ]; + + for test_case in test_cases { + test_protease(&known_proteases::ARGC, &test_case); + } + } +} From ae705036ec1984276bf5775e342e3b5323615dba Mon Sep 17 00:00:00 2001 From: Douwe Schulte Date: Fri, 11 Apr 2025 13:54:06 +0200 Subject: [PATCH 2/4] Added expected peptide size range and removed unspecific enzyme elastase --- rustyms/src/peptidoform/peptidoform.rs | 11 +++++++++-- rustyms/src/protease.rs | 12 ------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/rustyms/src/peptidoform/peptidoform.rs b/rustyms/src/peptidoform/peptidoform.rs index 58dd6c21..b2ddfd50 100644 --- a/rustyms/src/peptidoform/peptidoform.rs +++ b/rustyms/src/peptidoform/peptidoform.rs @@ -1511,7 +1511,12 @@ impl> Peptidoform { } /// Digest this sequence with the given protease and the given maximal number of missed cleavages. - pub fn digest(&self, protease: &Protease, max_missed_cleavages: usize) -> Vec { + pub fn digest( + &self, + protease: &Protease, + max_missed_cleavages: usize, + size_range: impl RangeBounds, + ) -> Vec { let mut sites = vec![0]; sites.extend_from_slice(&protease.match_locations(&self.sequence)); sites.push(self.len()); @@ -1520,7 +1525,9 @@ impl> Peptidoform { for (index, start) in sites.iter().enumerate() { for end in sites.iter().skip(index + 1).take(max_missed_cleavages + 1) { - result.push(self.sub_peptide((*start)..*end)); + if size_range.contains(&(end - start)) { + result.push(self.sub_peptide((*start)..*end)); + } } } result diff --git a/rustyms/src/protease.rs b/rustyms/src/protease.rs index 14f22102..78a7bdbc 100644 --- a/rustyms/src/protease.rs +++ b/rustyms/src/protease.rs @@ -118,18 +118,6 @@ pub mod known_proteases { ]) }); - /// Elastase cuts after Alanine (A), Glycine (G), Serine (S), Valine (V), Isoleucine (I), Leucine (L) - pub static ELASTASE: LazyLock = LazyLock::new(|| { - Protease::c_terminal_of(vec![ - AminoAcid::Alanine, - AminoAcid::Glycine, - AminoAcid::Serine, - AminoAcid::Valine, - AminoAcid::Isoleucine, - AminoAcid::Leucine, - ]) - }); - /// `AspN` cuts before Aspartic acid (D) pub static ASPN: LazyLock = LazyLock::new(|| Protease::n_terminal_of(vec![AminoAcid::AsparticAcid])); From 571a473ff71157e9b938c7a3cf5ecb7c1c65154b Mon Sep 17 00:00:00 2001 From: Auke Heerdink <21688542+aukeheerdink@users.noreply.github.com> Date: Fri, 11 Apr 2025 14:55:48 +0200 Subject: [PATCH 3/4] Added doc-tests --- rustyms/src/protease.rs | 232 ++++++++++++++++++++++++++-------------- 1 file changed, 151 insertions(+), 81 deletions(-) diff --git a/rustyms/src/protease.rs b/rustyms/src/protease.rs index 78a7bdbc..a7c7beb9 100644 --- a/rustyms/src/protease.rs +++ b/rustyms/src/protease.rs @@ -10,7 +10,146 @@ use crate::{AminoAcid, SequenceElement}; /// a specificity at a certain position any amino acid that is contained in the set is allowed (see /// [`crate::CheckedAminoAcid::canonical_identical`]). /// -/// #TODO: Add the possibility to define the effiency of amino acids or certain motifs +/// A standard set of proteases can be found here [`known_proteases`] +/// +/// # Examples +/// +/// ## Basic digestion with Trypsin +/// ```rust +/// # use rustyms::Peptidoform; +/// # use rustyms::known_proteases; +/// // Define a protease and sequence to digest +/// let trypsin = &known_proteases::TRYPSIN; +/// let sequence = Peptidoform::pro_forma("SIADIRGGKSLAIEGCRTKM", None).unwrap().into_linear().unwrap(); +/// +/// // Run an in-silico digest with 0 missed cleavages and only allow peptides ranging from 4 to 40 amino acids long +/// let digest = sequence.digest(trypsin, 0, 4..40); +/// +/// assert_eq!(digest.len(), 2); +/// assert_eq!(digest[0].to_string(), "SIADIR"); +/// ``` +/// +/// ## Finding cut sites in a sequence +/// ```rust +/// # use rustyms::Peptidoform; +/// # use rustyms::known_proteases; +/// // Define a protease and sequence to digest +/// let trypsin = &known_proteases::TRYPSIN; +/// let sequence = Peptidoform::pro_forma("SIADIRGRKM", None).unwrap().into_linear().unwrap(); +/// +/// // Get all locations where trypsin would cut +/// let cut_sites = trypsin.match_locations(sequence.sequence()); +/// +/// assert_eq!(cut_sites, vec![6, 8, 9]); +/// ``` +/// +/// ## Using different proteases +/// ```rust +/// # use rustyms::Peptidoform; +/// # use rustyms::known_proteases; +/// // Define a sequence to digest +/// let sequence = Peptidoform::pro_forma("FAREDKPGLF", None).unwrap().into_linear().unwrap(); +/// +/// // Compare different digestion patterns +/// let gluc_digest = sequence.digest(&known_proteases::GLUC, 0, 4..40); +/// let lysc_digest = sequence.digest(&known_proteases::LYSC, 0, 4..40); +/// +/// assert_eq!(gluc_digest[0].to_string(), "FARE"); +/// assert_eq!(lysc_digest[0].to_string(), "FAREDK"); +/// ``` +/// +/// ## Creating a custom protease +/// ```rust +/// # use rustyms::{Peptidoform, AminoAcid, Protease}; +/// // Define a custom protease that cuts after Histidine (H) +/// let his_protease = Protease::c_terminal_of(vec![AminoAcid::Histidine]); +/// +/// // Test it on a sequence +/// let sequence = Peptidoform::pro_forma("AAHFGHKLM", None).unwrap().into_linear().unwrap(); +/// let digest = sequence.digest(&his_protease, 0, 1..40); +/// +/// assert_eq!(digest.len(), 3); +/// assert_eq!(digest[0].to_string(), "AAH"); +/// assert_eq!(digest[1].to_string(), "FGH"); +/// assert_eq!(digest[2].to_string(), "KLM"); +/// ``` +/// +/// ## Digestion with missed cleavages +/// ```rust +/// # use rustyms::Peptidoform; +/// # use rustyms::known_proteases; +/// // Define a protease and sequence to digest +/// let trypsin = &known_proteases::TRYPSIN; +/// let sequence = Peptidoform::pro_forma("AARKFGKPLM", None).unwrap().into_linear().unwrap(); +/// +/// // With 0 missed cleavages +/// let digest_0 = sequence.digest(trypsin, 0, 1..40); +/// assert_eq!(digest_0.len(), 3); +/// +/// // With 1 missed cleavage +/// let digest_1 = sequence.digest(trypsin, 1, 1..40); +/// assert_eq!(digest_1.len(), 5); // Original 3 peptides + 2 peptides with 1 missed cleavage +/// ``` +/// +/// ## Digestion with a complex custom protease +/// ```rust +/// # use rustyms::{Peptidoform, AminoAcid, Protease}; +/// # use rustyms::known_proteases; +/// // A protease that: +/// // 1. Requires two phenylalanine residues followed by any amino acid before the cut site (FF*) +/// // 2. And requires any amino acid except proline followed by either histidine or tryptophan after the cut site (*H/W) +/// let custom_protease = Protease { +/// before: vec![ +/// Some(vec![AminoAcid::Phenylalanine]), +/// Some(vec![AminoAcid::Phenylalanine]), +/// None +/// ], +/// after: vec![ +/// Some(Protease::get_exclusive(&[AminoAcid::Proline])), +/// Some(vec![AminoAcid::Histidine, AminoAcid::Tryptophan]) +/// ], +/// }; +/// +/// // A sequence with multiple potential cut sites: +/// // - Position 10: FF(M)-(A)H - valid cut site +/// // - Position 18: FF(G)-(V)H - valid cut site +/// // - Position 26: FF(A)-(P)H - invalid due to proline after cut site +/// // - Position 34: FF(T)-(S)W - valid cut site +/// let long_sequence = Peptidoform::pro_forma("SLDARETFFMAHKDGFFGVHPDTFFAPHPHYFFTSWNVPG", None) +/// .unwrap() +/// .into_linear() +/// .unwrap(); +/// +/// // Find all cut sites +/// let cut_sites = custom_protease.match_locations(long_sequence.sequence()); +/// assert_eq!(cut_sites, vec![10, 18, 34]); +/// +/// // Perform a digestion with 0 missed cleavages +/// let digest = long_sequence.digest(&custom_protease, 0, 1..40); +/// +/// assert_eq!(digest.len(), 4); +/// assert_eq!(digest[0].to_string(), "SLDARETFFM"); +/// assert_eq!(digest[1].to_string(), "AHKDGFFG"); +/// assert_eq!(digest[2].to_string(), "VHPDTFFAPHPHYFFT"); +/// assert_eq!(digest[3].to_string(), "SWNVPG"); +/// +/// // Perform a digestion with 1 missed cleavage +/// let digest_missed = long_sequence.digest(&custom_protease, 1, 1..40); +/// +/// // Should include original fragments plus those with 1 missed cleavage +/// assert_eq!(digest_missed.len(), 7); +/// +/// // Original fragments +/// assert!(digest_missed.iter().any(|p| p.to_string() == "SLDARETFFM")); +/// assert!(digest_missed.iter().any(|p| p.to_string() == "AHKDGFFG")); +/// assert!(digest_missed.iter().any(|p| p.to_string() == "VHPDTFFAPHPHYFFT")); +/// assert!(digest_missed.iter().any(|p| p.to_string() == "SWNVPG")); +/// +/// // Fragments with 1 missed cleavage +/// assert!(digest_missed.iter().any(|p| p.to_string() == "SLDARETFFMAHKDGFFG")); +/// assert!(digest_missed.iter().any(|p| p.to_string() == "AHKDGFFGVHPDTFFAPHPHYFFT")); +/// assert!(digest_missed.iter().any(|p| p.to_string() == "VHPDTFFAPHPHYFFTSWNVPG")); +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Protease { /// The amino acids n terminal of the cut site. @@ -160,7 +299,7 @@ mod tests { ); // Test peptides - let peptides = test_case.sequence.digest(protease, 0); + let peptides = test_case.sequence.digest(protease, 0, 4..40); if peptides.len() != test_case.expected_peptides.len() { for peptide in &peptides { @@ -191,21 +330,12 @@ mod tests { ProteaseTestCase { sequence: str_to_peptidoform("AKRPGKR"), expected_cut_sites: vec![2, 6], - expected_peptides: vec![ - str_to_peptidoform("AK"), - str_to_peptidoform("RPGK"), - str_to_peptidoform("R"), - ], + expected_peptides: vec![str_to_peptidoform("RPGK")], }, ProteaseTestCase { sequence: str_to_peptidoform("ARAKGCVLRPKDGR"), expected_cut_sites: vec![2, 4, 11], - expected_peptides: vec![ - str_to_peptidoform("AR"), - str_to_peptidoform("AK"), - str_to_peptidoform("GCVLRPK"), - str_to_peptidoform("DGR"), - ], + expected_peptides: vec![str_to_peptidoform("GCVLRPK")], }, ]; @@ -220,20 +350,12 @@ mod tests { ProteaseTestCase { sequence: str_to_peptidoform("AFWYPLGF"), expected_cut_sites: vec![2, 3], - expected_peptides: vec![ - str_to_peptidoform("AF"), - str_to_peptidoform("W"), - str_to_peptidoform("YPLGF"), - ], + expected_peptides: vec![str_to_peptidoform("YPLGF")], }, ProteaseTestCase { sequence: str_to_peptidoform("AVFUDGWTYPMSR"), expected_cut_sites: vec![3, 7], - expected_peptides: vec![ - str_to_peptidoform("AVF"), - str_to_peptidoform("UDGW"), - str_to_peptidoform("TYPMSR"), - ], + expected_peptides: vec![str_to_peptidoform("UDGW"), str_to_peptidoform("TYPMSR")], }, ]; @@ -248,22 +370,12 @@ mod tests { ProteaseTestCase { sequence: str_to_peptidoform("AACVFLPAKLURF"), expected_cut_sites: vec![5, 6, 10], - expected_peptides: vec![ - str_to_peptidoform("AACVF"), - str_to_peptidoform("L"), - str_to_peptidoform("PAKL"), - str_to_peptidoform("URF"), - ], + expected_peptides: vec![str_to_peptidoform("AACVF"), str_to_peptidoform("PAKL")], }, ProteaseTestCase { sequence: str_to_peptidoform("GFLPKDLVMSRG"), expected_cut_sites: vec![2, 3, 7], - expected_peptides: vec![ - str_to_peptidoform("GF"), - str_to_peptidoform("L"), - str_to_peptidoform("PKDL"), - str_to_peptidoform("VMSRG"), - ], + expected_peptides: vec![str_to_peptidoform("PKDL"), str_to_peptidoform("VMSRG")], }, ]; @@ -272,56 +384,18 @@ mod tests { } } - #[test] - fn elastase() { - let test_cases = vec![ - ProteaseTestCase { - sequence: str_to_peptidoform("FASGVPRKT"), - expected_cut_sites: vec![2, 3, 4, 5], - expected_peptides: vec![ - str_to_peptidoform("FA"), - str_to_peptidoform("S"), - str_to_peptidoform("G"), - str_to_peptidoform("V"), - str_to_peptidoform("PRKT"), - ], - }, - ProteaseTestCase { - sequence: str_to_peptidoform("PGAVSLFTGK"), - expected_cut_sites: vec![2, 3, 4, 5, 6, 9], - expected_peptides: vec![ - str_to_peptidoform("PG"), - str_to_peptidoform("A"), - str_to_peptidoform("V"), - str_to_peptidoform("S"), - str_to_peptidoform("L"), - str_to_peptidoform("FTG"), - str_to_peptidoform("K"), - ], - }, - ]; - - for test_case in test_cases { - test_protease(&known_proteases::ELASTASE, &test_case); - } - } - #[test] fn aspn() { let test_cases = vec![ ProteaseTestCase { sequence: str_to_peptidoform("FARDKPGLFD"), expected_cut_sites: vec![3, 9], - expected_peptides: vec![ - str_to_peptidoform("FAR"), - str_to_peptidoform("DKPGLF"), - str_to_peptidoform("D"), - ], + expected_peptides: vec![str_to_peptidoform("DKPGLF")], }, ProteaseTestCase { sequence: str_to_peptidoform("PFKDLTMSR"), expected_cut_sites: vec![3], - expected_peptides: vec![str_to_peptidoform("PFK"), str_to_peptidoform("DLTMSR")], + expected_peptides: vec![str_to_peptidoform("DLTMSR")], }, ]; @@ -361,11 +435,7 @@ mod tests { ProteaseTestCase { sequence: str_to_peptidoform("PFKDLTKMSR"), expected_cut_sites: vec![3, 7], - expected_peptides: vec![ - str_to_peptidoform("PFK"), - str_to_peptidoform("DLTK"), - str_to_peptidoform("MSR"), - ], + expected_peptides: vec![str_to_peptidoform("DLTK")], }, ]; @@ -380,7 +450,7 @@ mod tests { ProteaseTestCase { sequence: str_to_peptidoform("FARKDPGLF"), expected_cut_sites: vec![3], - expected_peptides: vec![str_to_peptidoform("FAR"), str_to_peptidoform("KDPGLF")], + expected_peptides: vec![str_to_peptidoform("KDPGLF")], }, ProteaseTestCase { sequence: str_to_peptidoform("PFKDLRTMSR"), From 3e810b78b0330d3b4366e8ccd35db2183f1c72aa Mon Sep 17 00:00:00 2001 From: Douwe Schulte Date: Fri, 11 Apr 2025 15:39:20 +0200 Subject: [PATCH 4/4] Fix between stretches --- rustyms/src/protease.rs | 58 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/rustyms/src/protease.rs b/rustyms/src/protease.rs index a7c7beb9..3240d638 100644 --- a/rustyms/src/protease.rs +++ b/rustyms/src/protease.rs @@ -10,7 +10,7 @@ use crate::{AminoAcid, SequenceElement}; /// a specificity at a certain position any amino acid that is contained in the set is allowed (see /// [`crate::CheckedAminoAcid::canonical_identical`]). /// -/// A standard set of proteases can be found here [`known_proteases`] +/// A standard set of proteases can be found here [`known_proteases`]. /// /// # Examples /// @@ -159,11 +159,55 @@ pub struct Protease { } impl Protease { - /// Define a simple protease that cuts exactly between the specified sequences. - pub fn new(n_term: Vec, c_term: Vec) -> Self { + /// Define a protease that cuts exactly between the specified sequences. + /// ```rust + /// # use rustyms::{Peptidoform, Protease, AminoAcid}; + /// # let cetuximab = "QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSAASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK"; + /// // FabDELLO is a protease designed to create Fabs from antibodies + /// // Define the cut site for FabDello "KSCDK / THTCPPCP" + /// let fab_dello = Protease::between_stretches( + /// &"KSCDK".chars().map(|c| AminoAcid::try_from(c).unwrap()).collect::>(), + /// &"THTCPPCP".chars().map(|c| AminoAcid::try_from(c).unwrap()).collect::>()); + /// + /// // Get the sequence of the antibody Cetuximab + /// let sequence = Peptidoform::pro_forma(cetuximab, None).unwrap().into_linear().unwrap(); + /// + /// // Run an in-silico digest with 0 missed cleavages + /// let digest = sequence.digest(&fab_dello, 0, ..); + /// + /// assert_eq!(digest.len(), 2); + /// ``` + pub fn between_stretches(before: &[AminoAcid], after: &[AminoAcid]) -> Self { Self { - before: vec![Some(n_term)], - after: vec![Some(c_term)], + before: before.iter().map(|aa| Some(vec![*aa])).collect_vec(), + after: after.iter().map(|aa| Some(vec![*aa])).collect_vec(), + } + } + + /// Define a protease that cuts exactly between the specified options before the site and the specified options after the site. + /// ```rust + /// # use rustyms::{Peptidoform, Protease, AminoAcid}; + /// // Define a protease and sequence to digest + /// let chymotrypsin = Protease::between_options( + /// vec![ + /// AminoAcid::Phenylalanine, + /// AminoAcid::Tryptophan, + /// AminoAcid::Tyrosine, + /// ], + /// Protease::get_exclusive(&[AminoAcid::Proline]), + /// ); + /// let sequence = Peptidoform::pro_forma("AFWYPLGF", None).unwrap().into_linear().unwrap(); + /// + /// // Run an in-silico digest with 0 missed cleavages and only allow peptides ranging from 4 to 40 amino acids long + /// let digest = sequence.digest(&chymotrypsin, 0, 4..40); + /// + /// assert_eq!(digest.len(), 1); + /// assert_eq!(digest[0].to_string(), "YPLGF"); + /// ``` + pub fn between_options(before: Vec, after: Vec) -> Self { + Self { + before: vec![Some(before)], + after: vec![Some(after)], } } @@ -229,7 +273,7 @@ pub mod known_proteases { /// `Trypsin` cuts after Lysine (K) or Arginine (R), unless followed by Proline (P) pub static TRYPSIN: LazyLock = LazyLock::new(|| { - Protease::new( + Protease::between_options( vec![AminoAcid::Lysine, AminoAcid::Arginine], Protease::get_exclusive(&[AminoAcid::Proline]), ) @@ -237,7 +281,7 @@ pub mod known_proteases { /// `Chymotrypsin` cuts after Phenylalanine (F), Tryptophan (W), Tyrosine (Y), unless followed by Proline (P) pub static CHYMOTRYPSIN: LazyLock = LazyLock::new(|| { - Protease::new( + Protease::between_options( vec![ AminoAcid::Phenylalanine, AminoAcid::Tryptophan,