diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py index c759a5f..e31341a 100644 --- a/psm_utils/__init__.py +++ b/psm_utils/__init__.py @@ -1,6 +1,6 @@ """Common utilities for parsing and handling PSMs, and search engine results.""" -__version__ = "1.5.0.post1" +__version__ = "1.5.1" __all__ = ["Peptidoform", "PSM", "PSMList"] from warnings import filterwarnings diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 85ab9f6..bb39378 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -241,8 +241,16 @@ def _supports_write_psm(writer: type[WriterBase]) -> bool: temp_file.close() Path(temp_file.name).unlink() example_psm = PSM(peptidoform="ACDE", spectrum_id="0") + + # Prepare writer-specific kwargs for writers that need them + writer_kwargs = {} + if writer == percolator.PercolatorTabWriter: + writer_kwargs["style"] = "pin" + try: - with writer(temp_file.name, example_psm=example_psm) as writer_instance: + with writer( + temp_file.name, example_psm=example_psm, **writer_kwargs + ) as writer_instance: writer_instance.write_psm(example_psm) except NotImplementedError: supports_write_psm = False diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index 3f0bae0..4f7b5e2 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -34,8 +34,11 @@ import pyopenms as oms # type: ignore[import] _has_openms = True + # Check if we have pyOpenMS 3.5+ with PeptideIdentificationList + _has_peptide_id_list = hasattr(oms, "PeptideIdentificationList") except ImportError: _has_openms = False + _has_peptide_id_list = False oms = None # type: ignore[assignment] logger = logging.getLogger(__name__) @@ -157,8 +160,17 @@ def _parse_idxml(self) -> tuple[Any, Any]: """ protein_ids: Any = [] # list[oms.ProteinIdentification] - peptide_ids: Any = [] # list[oms.PeptideIdentification] - oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) # type: ignore + # In pyOpenMS 3.5+, peptide_ids must be a PeptideIdentificationList + if _has_peptide_id_list: + peptide_ids: Any = oms.PeptideIdentificationList() # type: ignore + else: + peptide_ids = [] # list[oms.PeptideIdentification] for pyOpenMS <3.5 + + # Load the idXML file - the lists will be populated by pyOpenMS + idxml_file = oms.IdXMLFile() # type: ignore + # Ensure filename is a string, not a Path object + filename_str: str = str(self.filename) + idxml_file.load(filename_str, protein_ids, peptide_ids) if len(protein_ids) == 0: raise IdXMLReaderEmptyListException( @@ -564,7 +576,10 @@ def _update_existing_ids( peptide_id.setHits(updated_peptide_hits) - oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) # type: ignore + # Store the idXML file + idxml_file = oms.IdXMLFile() # type: ignore + filename_str: str = str(self.filename) + idxml_file.store(filename_str, self.protein_ids, self.peptide_ids) def _update_peptide_hit(self, peptide_hit: Any, psm: PSM) -> None: """Inplace update of PeptideHit with novel predicted features information from PSM.""" @@ -594,7 +609,11 @@ def _create_ids_for_collection( ) -> None: """Create ProteinIdentification and PeptideIdentification objects for a single collection.""" self.protein_ids = [oms.ProteinIdentification()] # type: ignore - self.peptide_ids = [] + # In pyOpenMS 3.5+, peptide_ids must be a PeptideIdentificationList + if _has_peptide_id_list: + self.peptide_ids = oms.PeptideIdentificationList() # type: ignore + else: + self.peptide_ids = [] # list[oms.PeptideIdentification] for pyOpenMS <3.5 # Set msrun filename with spectra_data meta value msrun_reference = [str(run).encode() for run in runs.keys()] @@ -617,14 +636,19 @@ def _create_ids_for_collection( # Create PeptideHits peptide_hits = [self._create_peptide_hit(psm) for psm in psms] peptide_id.setHits(peptide_hits) - self.peptide_ids.append(peptide_id) + # Use push_back for pyOpenMS 3.5+, append for older versions + if _has_peptide_id_list: + self.peptide_ids.push_back(peptide_id) # type: ignore + else: + self.peptide_ids.append(peptide_id) # type: ignore[union-attr] # Create protein hits self._create_protein_hits(protein_list) # Write idXML file - filename = "/".join(filter(None, [collection, str(self.filename)])) - oms.IdXMLFile().store(filename, self.protein_ids, self.peptide_ids) # type: ignore + filename: str = "/".join(filter(None, [collection, str(self.filename)])) + idxml_file = oms.IdXMLFile() # type: ignore + idxml_file.store(filename, self.protein_ids, self.peptide_ids) # type: ignore def _create_peptide_identification( self, diff --git a/psm_utils/io/peptide_record.py b/psm_utils/io/peptide_record.py index 749b3f9..235e20c 100644 --- a/psm_utils/io/peptide_record.py +++ b/psm_utils/io/peptide_record.py @@ -448,7 +448,8 @@ def from_dataframe(peprec_df: pd.DataFrame) -> PSMList: """ psm_list = [] for _, row in peprec_df.iterrows(): - entry = _PeprecEntry(**row.to_dict()) + row_dict = {str(k): v for k, v in row.to_dict().items()} + entry = _PeprecEntry(**row_dict) psm_list.append(PeptideRecordReader._entry_to_psm(entry, filename="")) return PSMList(psm_list=psm_list) diff --git a/psm_utils/io/pepxml.py b/psm_utils/io/pepxml.py index 7c2fc78..6a82542 100644 --- a/psm_utils/io/pepxml.py +++ b/psm_utils/io/pepxml.py @@ -27,6 +27,12 @@ "mzFidelity", ] +KNOWN_METADATA_KEYS = [ + "num_matched_ions", + "tot_num_ions", + "num_missed_cleavages", +] + class PepXMLReader(ReaderBase): """Reader for pepXML PSM files.""" @@ -127,32 +133,40 @@ def _parse_peptidoform( def _parse_psm(self, spectrum_query: dict[str, Any], search_hit: dict[str, Any]) -> PSM: """Parse pepXML PSM to PSM.""" - metadata = { - "num_matched_ions": str(search_hit["num_matched_ions"]), - "tot_num_ions": str(search_hit["tot_num_ions"]), - "num_missed_cleavages": str(search_hit["num_missed_cleavages"]), - } + # Build metadata from optional search hit fields + metadata = {key: str(search_hit[key]) for key in KNOWN_METADATA_KEYS if key in search_hit} + + # Add all search scores to metadata metadata.update( { - f"search_score_{key.lower()}": str(search_hit["search_score"][key]) - for key in search_hit["search_score"] + f"search_score_{key.lower()}": str(value) + for key, value in search_hit["search_score"].items() } ) + # Build provenance data from optional spectrum query fields + provenance_data = { + k: str(v) + for k, v in { + "pepxml_index": spectrum_query.get("index"), + "start_scan": spectrum_query.get("start_scan"), + "end_scan": spectrum_query.get("end_scan"), + }.items() + if v is not None + } + return PSM( peptidoform=self._parse_peptidoform( search_hit["peptide"], search_hit["modifications"], spectrum_query["assumed_charge"], ), - spectrum_id=spectrum_query["spectrumNativeID"] - if "spectrumNativeID" in spectrum_query - else spectrum_query["spectrum"], + spectrum_id=spectrum_query.get("spectrumNativeID", spectrum_query.get("spectrum")), run=None, collection=None, spectrum=None, is_decoy=None, - score=search_hit["search_score"][self.score_key], + score=search_hit["search_score"].get(self.score_key, None), qvalue=None, pep=None, precursor_mz=mass_to_mz( @@ -160,14 +174,10 @@ def _parse_psm(self, spectrum_query: dict[str, Any], search_hit: dict[str, Any]) ), retention_time=spectrum_query.get("retention_time_sec"), ion_mobility=spectrum_query.get("ion_mobility"), - protein_list=[p["protein"] for p in search_hit["proteins"]], - rank=search_hit["hit_rank"], + protein_list=[p["protein"] for p in search_hit.get("proteins", [])], + rank=search_hit.get("hit_rank", None), source=None, - provenance_data={ - "pepxml_index": str(spectrum_query["index"]), - "start_scan": str(spectrum_query["start_scan"]), - "end_scan": str(spectrum_query["end_scan"]), - }, + provenance_data=provenance_data, metadata=metadata, rescoring_features={}, ) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index 41f86ac..c18d8e3 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -342,8 +342,10 @@ def write_file(self, psm_list: PSMList) -> None: f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore" ) writer.writeheader() - for psm in psm_list: - writer.writerow(self._psm_to_entry(psm)) + for i, psm in enumerate(psm_list): + entry = self._psm_to_entry(psm) + entry["ScanNr"] = i + writer.writerow(entry) def _psm_to_entry(self, psm: PSM) -> dict[str, Any]: """Parse PSM to Percolator Tab entry."""