onecodex · audy · Apr 18, 2025 · Mar 31, 2025 · Mar 31, 2025 · Mar 31, 2025
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -25,9 +25,14 @@ jobs:
         platform:
           - runner: ubuntu-22.04
             target: x86_64
+          - runner: ubuntu-22.04
+            target: aarch64
         python-version:
+          - "3.9"
+          - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
     steps:
       - uses: actions/checkout@v4
       - name: Build wheels
@@ -53,8 +58,11 @@ jobs:
           - runner: macos-14
             target: aarch64
         python-version:
+          - "3.9"
+          - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
     steps:
       - uses: actions/checkout@v4
       - name: Build wheels

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,10 @@
 [package]
 name = "needletail"
 version = "0.6.3"
-authors = ["Roderick Bovee <rbovee@gmail.com>", "Vincent Prouillet <vincent@onecodex.com>"]
+authors = [
+    "Roderick Bovee <rbovee@gmail.com>",
+    "Vincent Prouillet <vincent@onecodex.com>",
+]
 description = "FASTX parsing and k-mer methods"
 keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"]
 categories = ["science", "parsing"]
@@ -28,7 +31,7 @@ bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] }
 bzip2 = { version = "0.4", optional = true }
 flate2 = { version = "1.0.30", optional = true }
 memchr = "2.7.2"
-pyo3 = { version = "0.21.2", optional = true }
+pyo3 = { version = "0.24.1", optional = true }
 liblzma = { version = "0.3.1", optional = true }
 zstd = { version = "0.13.2", optional = true }
 

diff --git a/needletail.pyi b/needletail.pyi
@@ -0,0 +1,235 @@
+from pathlib import Path
+from typing import Iterator, Union
+
+class FastxReader(Iterator[Record]):
+    """An iterator that yields sequence records.
+
+    Instances are what `parse_fastx_file` and `parse_fastx_string` return.
+
+    Yields
+    ------
+    Record
+        A `Record` object representing a sequence record.
+
+    See Also
+    --------
+    parse_fastx_file:
+        A function to parse sequence records from a FASTA/FASTQ file.
+    parse_fastx_string:
+        A function to parse sequence records from a FASTA/FASTQ string.
+    Record:
+        A class representing a FASTA/FASTQ sequence record.
+    """
+
+class Record:
+    """
+    A record representing a biological sequence.
+
+    Parameters
+    ----------
+    id : str
+        The identifier of the sequence record.
+    seq : str
+        A string representing the sequence.
+
+    Attributes
+    ----------
+    id : str
+        The identifier of the sequence record. In a FASTA file, this is the
+        string containing all characters (including whitespaces) after the
+        leading '>' character. In a FASTQ file, this is the string containing
+        all characters (including whitespaces) after the leading '@' character.
+    seq : str
+        A string representing the sequence.
+    qual : str, optional
+        A string representing the quality scores of the sequence. If the object
+        represents a FASTA record, this attribute will be `None`.
+    name : str
+        The name of the sequence record. This is the string before the first
+        whitespace character in the `id` attribute.
+    description : str, optional
+        The description of the sequence record. This is the string after the
+        first whitespace character in the `id` attribute. If the `id` attribute
+        contains no whitespace characters, this attribute will be `None`.
+
+    Methods
+    -------
+    is_fasta
+        Check if the object represents a FASTA record.
+    is_fastq
+        Check if the object represents a FASTQ record.
+    normalize(iupac)
+        Normalize the sequence stored in the `seq` attribute of the object.
+    """
+
+    # Declared so type checkers accept access to the attributes documented in
+    # the class docstring.
+    # NOTE(review): declared as plain attributes; switch to read-only
+    # properties if the extension module does not allow assignment.
+    id: str
+    seq: str
+    qual: Union[str, None]
+    name: str
+    description: Union[str, None]
+
+    def is_fasta(self) -> bool:
+        """
+        Check if the object represents a FASTA record.
+
+        Returns
+        -------
+        bool
+            `True` if the record lacks quality information, otherwise `False`.
+        """
+        ...
+
+    def is_fastq(self) -> bool:
+        """
+        Check if the object represents a FASTQ record.
+
+        Returns
+        -------
+        bool
+            `True` if the record has quality information, otherwise `False`.
+        """
+        ...
+
+    # NOTE(review): `iupac` is not documented for this method; presumably it
+    # has the same meaning as the `iupac` flag of `normalize_seq` — confirm.
+    def normalize(self, iupac: bool) -> None:
+        """
+        Normalize the sequence stored in the `seq` attribute of the object.
+
+        See Also
+        --------
+        normalize_seq: A function to normalize nucleotide sequence strings.
+
+        Notes
+        -----
+        The `normalize` method is designed for nucleotide sequences only. If
+        used with protein sequences, it will incorrectly process amino acid
+        characters as if they were nucleotides.
+        """
+        ...
+
+def parse_fastx_file(path: Union[str, Path]) -> FastxReader:
+    """
+    Parse a FASTA/FASTQ file and return an iterator that yields its sequence
+    records.
+
+    Parameters
+    ----------
+    path : str or pathlib.Path
+        Location of the FASTA/FASTQ file to parse.
+
+    Returns
+    -------
+    FastxReader
+        An iterator yielding `Record` objects, one per sequence found in the
+        input file.
+
+    Raises
+    ------
+    NeedletailError
+        If reading or parsing the input file fails.
+
+    See Also
+    --------
+    parse_fastx_string:
+        The counterpart that parses records from a FASTA/FASTQ string.
+    FastxReader:
+        The iterator class whose instances yield `Record` objects.
+    """
+    ...
+
+def parse_fastx_string(fastx_string: str) -> FastxReader:
+    """
+    Returns an iterator that parses a FASTA/FASTQ string and yields sequence
+    records.
+
+    Parameters
+    ----------
+    fastx_string : str
+        A string containing FASTA/FASTQ-formatted sequence records.
+
+    Returns
+    -------
+    FastxReader
+        A `FastxReader` iterator that yields `Record` objects representing
+        sequences from the input string.
+
+    Raises
+    ------
+    NeedletailError
+        If an error occurs while parsing the input string.
+
+    See Also
+    --------
+    parse_fastx_file:
+        A function to parse sequence records from a FASTA/FASTQ file.
+    FastxReader:
+        A class with instances that are iterators that yield `Record` objects.
+    """
+    # NOTE(review): the docstring previously called this parameter `content`;
+    # confirm which name the extension module accepts as a keyword.
+    ...
+
+def normalize_seq(seq: str, iupac: bool) -> str:
+    """
+    Normalize a nucleotide sequence string.
+
+    The following transformations are applied:
+
+    - Lowercase characters are converted to uppercase.
+    - Whitespace and newline characters are removed.
+    - 'U' is replaced with 'T'.
+    - '.' and '~' are replaced with '-'.
+    - Characters not in 'ACGTN-' are replaced with 'N', unless `iupac` is
+      `True`, in which case characters representing nucleotide ambiguity are
+      kept.
+
+    Parameters
+    ----------
+    seq : str
+        A string representing a nucleotide sequence.
+    iupac : bool, default: False
+        If `True`, characters representing nucleotide ambiguity ('B', 'D',
+        'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase
+        forms) will not be converted to 'N'. Lowercase characters will still
+        be converted to uppercase.
+
+    Returns
+    -------
+    str
+        The normalized sequence string.
+
+    Notes
+    -----
+    The `normalize_seq` function is designed for nucleotide sequences only. If
+    used with protein sequences, it will incorrectly process amino acid
+    characters as if they were nucleotides.
+    """
+    # NOTE(review): the docstring documents a default of False for `iupac`,
+    # but this signature declares no default — confirm against the binding.
+    ...
+
+def reverse_complement(seq: str) -> str:
+    """
+    Return the reverse complement of a nucleotide sequence.
+
+    Parameters
+    ----------
+    seq : str
+        The nucleotide sequence to reverse-complement, given as a string.
+
+    Returns
+    -------
+    str
+        The reverse complement of the input nucleotide sequence.
+
+    Notes
+    -----
+    The `reverse_complement` function is designed for nucleotide sequences
+    only. If used with protein sequences, it will incorrectly process
+    amino acid characters as if they were nucleotides.
+    """
+    ...
+
+def decode_phred(qual: str, base_64: bool) -> tuple[int, ...]:
+    """
+    Decode Phred quality strings to quality scores.
+
+    Parameters
+    ----------
+    qual : str
+        A string representing Phred-encoded quality strings.
+    base_64 : bool, default=False
+        If `True`, decode the quality string using the Phred+64 encoding,
+        otherwise the Phred+33 encoding will be used.
+
+    Returns
+    -------
+    tuple of int
+        A tuple of integers representing quality scores derived from the
+        probability of a base-calling error using a logarithmic transformation.
+    """
+    # NOTE(review): the docstring previously called the first parameter
+    # `phred`, and it documents a default of False for `base_64` that this
+    # signature does not declare — confirm both against the binding.
+    ...
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,14 @@
 [build-system]
-requires = ["maturin>=1.7,<2.0"]
+requires = ["maturin>=1.8,<2.0"]
 build-backend = "maturin"
 
 [project]
 name = "needletail"
+requires-python = ">=3.8"
 dynamic = ["version"]
 classifier = [
     "Intended Audience :: Science/Research",
-    "Programming Language :: Python :: 3",
+    "Programming Language :: Rust",
     "License :: OSI Approved :: MIT License",
     "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]