From 9c3ff79204f4645117ff6954f810e4ae0ad151c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Mon, 31 Mar 2025 10:56:39 -0700 Subject: [PATCH 1/9] Add Python 3.13 and Linux aarch to CI builds --- .github/workflows/release.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5bcfa83..0694a51 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,9 +25,12 @@ jobs: platform: - runner: ubuntu-22.04 target: x86_64 + - runner: ubuntu-22.04 + target: aarch64 python-version: - "3.11" - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - name: Build wheels @@ -55,6 +58,7 @@ jobs: python-version: - "3.11" - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - name: Build wheels From bf16b969b203037275426dc4d2156477b2dddee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Mon, 31 Mar 2025 11:03:47 -0700 Subject: [PATCH 2/9] Bump PyO3 to 0.24.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ecd71f0..af634cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] } bzip2 = { version = "0.4", optional = true } flate2 = { version = "1.0.30", optional = true } memchr = "2.7.2" -pyo3 = { version = "0.21.2", optional = true } +pyo3 = { version = "0.24.0", optional = true } liblzma = { version = "0.3.1", optional = true } zstd = { version = "0.13.2", optional = true } From 6f2c252cf6c49805e711eaf4a23092592a26cff4 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Mon, 31 Mar 2025 16:27:38 -0700 Subject: [PATCH 3/9] Use locks to make `PyFastxReader` thread-safe --- src/python.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/python.rs b/src/python.rs index 6888c24..1d81f2a 100644 --- a/src/python.rs +++ b/src/python.rs @@ -19,6 +19,7 @@ use 
pyo3::{create_exception, wrap_pyfunction}; use std::hash::{DefaultHasher, Hash, Hasher}; use std::io::Cursor; use std::path::PathBuf; +use std::sync::Mutex; create_exception!(needletail, NeedletailError, pyo3::exceptions::PyException); @@ -56,7 +57,7 @@ fn get_seq_snippet(seq: &str, max_len: usize) -> String { /// A class representing a FASTA/FASTQ sequence record. #[pyclass] pub struct PyFastxReader { - reader: Box, + reader: Mutex>, } #[pymethods] @@ -69,8 +70,8 @@ impl PyFastxReader { slf } - fn __next__(mut slf: PyRefMut) -> PyResult> { - if let Some(rec) = slf.reader.next() { + fn __next__(slf: PyRefMut) -> PyResult> { + if let Some(rec) = slf.reader.lock().unwrap().next() { let record = py_try!(rec); Ok(Some(Record::from_sequence_record(&record))) } else { @@ -280,7 +281,9 @@ impl Record { #[pyfunction] fn parse_fastx_file(path: PathBuf) -> PyResult { let reader = py_try!(rs_parse_fastx_file(path)); - Ok(PyFastxReader { reader }) + Ok(PyFastxReader { + reader: reader.into(), + }) } /// Parse sequence records from a FASTA/FASTQ string. 
@@ -310,7 +313,9 @@ fn parse_fastx_file(path: PathBuf) -> PyResult { #[pyfunction] fn parse_fastx_string(content: &str) -> PyResult { let reader = py_try!(parse_fastx_reader(Cursor::new(content.to_owned()))); - Ok(PyFastxReader { reader }) + Ok(PyFastxReader { + reader: reader.into(), + }) } /// Normalize the sequence string of nucleotide records by: @@ -383,6 +388,6 @@ fn needletail(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_fastx_string))?; m.add_wrapped(wrap_pyfunction!(normalize_seq))?; m.add_wrapped(wrap_pyfunction!(reverse_complement))?; - m.add("NeedletailError", py.get_type_bound::())?; + m.add("NeedletailError", py.get_type::())?; Ok(()) } From 78f6a52784813e8633e6603a9ee23816b726020f Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Mon, 31 Mar 2025 16:27:51 -0700 Subject: [PATCH 4/9] Update pyproject.toml --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3394db4..e61223d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,14 @@ [build-system] -requires = ["maturin>=1.7,<2.0"] +requires = ["maturin>=1.8,<2.0"] build-backend = "maturin" [project] name = "needletail" +requires-python = ">=3.8" dynamic = ["version"] classifier = [ "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3", + "Programming Language :: Rust", "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics", ] From 345a107d0a6057313d48bd8983873cfc3b12bab4 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Thu, 17 Apr 2025 16:18:40 -0700 Subject: [PATCH 5/9] Build for Python 3.9 and 3.10 --- .github/workflows/release.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0694a51..d99006a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -28,6 +28,8 @@ jobs: - runner: ubuntu-22.04 target: 
aarch64 python-version: + - "3.9" + - "3.10" - "3.11" - "3.12" - "3.13" @@ -56,6 +58,8 @@ jobs: - runner: macos-14 target: aarch64 python-version: + - "3.9" + - "3.10" - "3.11" - "3.12" - "3.13" From 49ce5add07e0ba69fcb0a69ecb4cb9548ef9f255 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Thu, 17 Apr 2025 17:18:32 -0700 Subject: [PATCH 6/9] Bump PyO3 --- Cargo.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index af634cd..7185e7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "needletail" version = "0.6.3" -authors = ["Roderick Bovee ", "Vincent Prouillet "] +authors = [ + "Roderick Bovee ", + "Vincent Prouillet ", +] description = "FASTX parsing and k-mer methods" keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"] categories = ["science", "parsing"] @@ -28,7 +31,7 @@ bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] } bzip2 = { version = "0.4", optional = true } flate2 = { version = "1.0.30", optional = true } memchr = "2.7.2" -pyo3 = { version = "0.24.0", optional = true } +pyo3 = { version = "0.24.1", optional = true } liblzma = { version = "0.3.1", optional = true } zstd = { version = "0.13.2", optional = true } From dddd37fb4c44d1c602da6894009a5c8bb9341162 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Thu, 17 Apr 2025 17:18:45 -0700 Subject: [PATCH 7/9] Drop `PyTuple::new_bound` --- src/python.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python.rs b/src/python.rs index 0da7aeb..54ddedc 100644 --- a/src/python.rs +++ b/src/python.rs @@ -379,12 +379,12 @@ pub fn reverse_complement(seq: &str) -> PyResult { Ok(String::from_utf8_lossy(&comp).to_string()) } -/// Decode Phred quality data to quality scores. +/// Decode Phred quality strings to quality scores. /// /// Parameters: /// ----------- /// phred : str -/// A string representing Phred-encoded quality data. 
+/// A string representing Phred-encoded quality strings. /// base_64 : bool, default=False /// If `True`, return the quality using the Phred+64 encoding, otherwise /// the Phred+33 encoding will be used. @@ -402,9 +402,9 @@ pub fn py_decode_phred(qual: &str, base_64: bool, py: Python<'_>) -> PyResult Date: Thu, 17 Apr 2025 18:12:37 -0700 Subject: [PATCH 8/9] Add stub file --- needletail.pyi | 233 +++++++++++++++++++++++++++++++++++++++++++++++++ src/python.rs | 36 ++++++-- 2 files changed, 260 insertions(+), 9 deletions(-) create mode 100644 needletail.pyi diff --git a/needletail.pyi b/needletail.pyi new file mode 100644 index 0000000..de176e8 --- /dev/null +++ b/needletail.pyi @@ -0,0 +1,233 @@ +from pathlib import Path +from typing import Iterator, Union + +class FastxReader(Iterator[Record]): + """An iterator that yields sequence records. + + Yields + ------ + Record + A `Record` object representing a sequence record. + + See also + -------- + parse_fastx_file: + A function to parse sequence records from a FASTA/FASTQ file. + parse_fastx_string: + A function to parse sequence records from a FASTA/FASTQ string. + Record: + A class representing a FASTA/FASTQ sequence record. + """ + +class Record: + """ + A record representing a biological sequence. + + Parameters + ---------- + id : str + The identifier of the sequence record. + seq : str + A string representing the sequence. + + Attributes + ---------- + id : str + The identifier of the sequence record. In a FASTA file, this is the + string containing all characters (including whitespaces) after the + leading '>' character. In a FASTQ file, this is the string containing + all characters (including whitespaces) after the leading '@' character. + seq : str + A string representing the sequence. + qual : str, optional + A string representing the quality scores of the sequence. If the object + represents a FASTA record, this attribute will be `None`. + name : str + The name of the sequence record. 
This is the string before the first + whitespace character in the `id` attribute. + description : str, optional + The description of the sequence record. This is the string after the + first whitespace character in the `id` attribute. If the `id` attribute + contains no whitespace characters, this attribute will be `None`. + + Methods + ------- + is_fasta + Check if the object represents a FASTA record. + is_fastq + Check if the object represents a FASTQ record. + normalize(iupac) + Normalize the sequence stored in the `seq` attribute of the object. + """ + def is_fasta(self) -> bool: + """ + Check if the object represents a FASTA record. + + Returns + ------- + bool + `True` if the record lacks quality information, otherwise `False`. + """ + pass + + def is_fastq(self) -> bool: + """ + Check if the object represents a FASTQ record. + + Returns + ------- + bool + `True` if the record has quality information, otherwise `False`. + """ + pass + + def normalize(self, iupac: bool) -> None: + """ + Normalize the sequence stored in the `seq` attribute of the object. + + See also + -------- + normalize_seq: A function to normalize nucleotide sequence strings. + + Notes + ----- + The `normalize` method is designed for nucleotide sequences only. If + used with protein sequences, it will incorrectly process amino acid + characters as if they were nucleotides. + """ + pass + +def parse_fastx_file(path: Union[str, Path]) -> FastxReader: + """ + An iterator that reads sequence records from a FASTA/FASTQ file. + + Parameters + ---------- + path : str or pathlib.Path + The path to a FASTA/FASTQ file. + + Returns + ------- + FastxReader + A `FastxReader` iterator that yields `Record` objects representing + sequences from the input file. + + Raises + ------ + NeedletailError + If an error occurs while reading and parsing the input file. + + See also + -------- + parse_fastx_string: + A function to parse sequence records from a FASTA/FASTQ string. 
+ FastxReader: + A class with instances that are iterators that yield `Record` objects. + """ + pass + +def parse_fastx_string(fastx_string: str) -> FastxReader: + """ + Parse sequence records from a FASTA/FASTQ string. + + Parameters + ---------- + fastx_string : str + A string containing FASTA/FASTQ-formatted sequence records. + + Returns + ------- + FastxReader + A `FastxReader` iterator that yields `Record` objects representing + sequences from the input string. + + Raises + ------ + NeedletailError + If an error occurs while parsing the input string. + + See also + -------- + parse_fastx_file: + A function to parse sequence records from a FASTA/FASTQ file. + FastxReader: + A class with instances that are iterators that yield `Record` objects. + """ + pass + +def normalize_seq(seq: str, iupac: bool) -> str: + """ + Normalize the sequence string of nucleotide records by: + + - Converting lowercase characters to uppercase. + - Removing whitespace and newline characters. + - Replacing 'U' with 'T'. + - Replacing '.' and '~' with '-'. + - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is `True`, + in which case characters representing nucleotide ambiguity are not + replaced. + + Parameters + ---------- + seq : str + A string representing a nucleotide sequence. + iupac : bool, default: False + If `True`, characters representing nucleotide ambiguity ('B', 'D', + 'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase + forms) will not be converted to 'N'. Lowercase characters will still + be converted to uppercase. + + Returns + ------- + str + The normalized sequence string. + + Notes + ----- + The `normalize_seq` function is designed for nucleotide sequences only. If + used with protein sequences, it will incorrectly process amino acid + characters as if they were nucleotides. + """ + pass + +def reverse_complement(seq: str) -> str: + """ + Compute the reverse complement of a nucleotide sequence. 
+ + Parameters + ---------- + seq : str + A string representing a nucleotide sequence. + + Returns + ------- + str + The reverse complement of the input nucleotide sequence. + + Notes + ----- + The `reverse_complement` function is designed for nucleotide sequences + only. If used with protein sequences, it will incorrectly process + amino acid characters as if they were nucleotides. + """ + pass + +def decode_phred(qual: str, base_64: bool) -> tuple[int]: + """ + Decode Phred quality strings to quality scores. + + Parameters + ---------- + qual : str + A Phred-encoded quality string. + base_64 : bool, default=False + If `True`, return the quality using the Phred+64 encoding, otherwise + the Phred+33 encoding will be used. + + Returns + ------- + tuple of int + A tuple of integers representing quality scores derived from the + probability of a base-calling error using a logarithmic transformation. + """ + pass diff --git a/src/python.rs b/src/python.rs index 54ddedc..35067cf 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,6 +1,11 @@ //! Python bindings for needletail // TODO: +// - The `normalize` method of the `Record` class should return a new `Record` +// object with the normalized sequence. +// - Add a `reverse_complement` method to the `Record` class that returns a new +// `Record` object with the reverse complement of the sequence. +// - Turn `is_fasta` and `is_fastq` into properties. // - Make the return values of `__repr__` and `__str__` show up as raw strings. // - Make `normalize_seq`, `reverse_complement`, and `decode_phred` functions // able to handle `Record` objects as input. @@ -54,6 +59,7 @@ fn get_seq_snippet(seq: &str, max_len: usize) -> String { /// Record: /// A class representing a FASTA/FASTQ sequence record. 
#[pyclass] +#[pyo3(name = "FastxReader")] pub struct PyFastxReader { reader: Mutex>, } @@ -180,6 +186,12 @@ impl Record { /// See also /// -------- /// normalize_seq: A function to normalize nucleotide sequence strings. + /// + /// Notes + /// ----- + /// The `normalize` method is designed for nucleotide sequences only. If + /// used with protein sequences, it will incorrectly process amino acid + /// characters as if they were nucleotides. #[pyo3(signature = (iupac=false))] pub fn normalize(&mut self, iupac: bool) -> PyResult<()> { if let Some(s) = normalize(self.seq.as_bytes(), iupac) { @@ -261,8 +273,8 @@ impl Record { /// /// Returns /// ------- -/// PyFastxReader -/// A `PyFastxReader` iterator that yields `Record` objects representing +/// FastxReader +/// A `FastxReader` iterator that yields `Record` objects representing /// sequences from the input file. /// /// Raises @@ -274,7 +286,7 @@ impl Record { /// -------- /// parse_fastx_string: /// A function to parse sequence records from a FASTA/FASTQ string. -/// PyFastxReader: +/// FastxReader: /// A class with instances that are iterators that yield `Record` objects. 
#[pyfunction] -fn parse_fastx_string(content: &str) -> PyResult { - let reader = py_try!(parse_fastx_reader(Cursor::new(content.to_owned()))); +fn parse_fastx_string(fastx_string: &str) -> PyResult { + let reader = py_try!(parse_fastx_reader(Cursor::new(fastx_string.to_owned()))); Ok(PyFastxReader { reader: reader.into(), }) @@ -344,7 +356,7 @@ fn parse_fastx_string(content: &str) -> PyResult { /// /// Notes /// ----- -/// The `normalize` method is designed for nucleotide sequences only. If +/// The `normalize_seq` function is designed for nucleotide sequences only. If /// used with protein sequences, it will incorrectly process amino acid /// characters as if they were nucleotides. #[pyfunction] @@ -368,6 +380,12 @@ pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { /// -------- /// str /// The reverse complement of the input nucleotide sequence. +/// +/// Notes +/// ----- +/// The `reverse_complement` function is designed for nucleotide sequences +/// only. If used with protein sequences, it will incorrectly process +/// amino acid characters as if they were nucleotides. #[pyfunction] pub fn reverse_complement(seq: &str) -> PyResult { let comp: Vec = seq From 4fde193646ae846c892782dfd53d0ea3c56d7169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 17 Apr 2025 23:47:52 -0700 Subject: [PATCH 9/9] Update `parse_fastx_file` and `parse_fastx_string` docstrings --- needletail.pyi | 6 ++++-- src/python.rs | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/needletail.pyi b/needletail.pyi index de176e8..71c35a3 100644 --- a/needletail.pyi +++ b/needletail.pyi @@ -99,7 +99,8 @@ class Record: def parse_fastx_file(path: Union[str, Path]) -> FastxReader: """ - An iterator that reads sequence records from a FASTA/FASTQ file. + Returns an iterator that parses a FASTA/FASTQ file and yields sequence + records. 
Parameters ---------- @@ -128,7 +129,8 @@ def parse_fastx_file(path: Union[str, Path]) -> FastxReader: def parse_fastx_string(fastx_string: str) -> FastxReader: """ - Parse sequence records from a FASTA/FASTQ string. + Returns an iterator that parses a FASTA/FASTQ string and yields sequence + records. Parameters ---------- diff --git a/src/python.rs b/src/python.rs index 35067cf..0d1642b 100644 --- a/src/python.rs +++ b/src/python.rs @@ -264,7 +264,8 @@ impl Record { } } -/// An iterator that reads sequence records from a FASTA/FASTQ file. +/// Returns an iterator that parses a FASTA/FASTQ file and yields sequence +/// records. /// /// Parameters /// ---------- @@ -297,7 +298,8 @@ fn py_parse_fastx_file(path: PathBuf) -> PyResult { }) } -/// Parse sequence records from a FASTA/FASTQ string. +/// Returns an iterator that parses a FASTA/FASTQ string and yields sequence +/// records. /// /// Parameters /// ----------