Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@ jobs:
platform:
- runner: ubuntu-22.04
target: x86_64
- runner: ubuntu-22.04
target: aarch64
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
- "3.13"
steps:
- uses: actions/checkout@v4
- name: Build wheels
Expand All @@ -53,8 +58,11 @@ jobs:
- runner: macos-14
target: aarch64
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
- "3.13"
steps:
- uses: actions/checkout@v4
- name: Build wheels
Expand Down
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
[package]
name = "needletail"
version = "0.6.3"
authors = ["Roderick Bovee <rbovee@gmail.com>", "Vincent Prouillet <vincent@onecodex.com>"]
authors = [
"Roderick Bovee <rbovee@gmail.com>",
"Vincent Prouillet <vincent@onecodex.com>",
]
description = "FASTX parsing and k-mer methods"
keywords = ["FASTA", "FASTQ", "kmer", "bioinformatics"]
categories = ["science", "parsing"]
Expand All @@ -28,7 +31,7 @@ bytecount = { version = "0.6", features = ["runtime-dispatch-simd"] }
bzip2 = { version = "0.4", optional = true }
flate2 = { version = "1.0.30", optional = true }
memchr = "2.7.2"
pyo3 = { version = "0.21.2", optional = true }
pyo3 = { version = "0.24.1", optional = true }
liblzma = { version = "0.3.1", optional = true }
zstd = { version = "0.13.2", optional = true }

Expand Down
235 changes: 235 additions & 0 deletions needletail.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
from pathlib import Path
from typing import Iterator, Union

class FastxReader(Iterator[Record]):
    """Iterator over sequence records.

    Every iteration step produces one `Record`.

    Yields
    ------
    Record
        A `Record` object holding one sequence record.

    See Also
    --------
    parse_fastx_file : Parse sequence records from a FASTA/FASTQ file.
    parse_fastx_string : Parse sequence records from a FASTA/FASTQ string.
    Record : Class representing a FASTA/FASTQ sequence record.
    """

class Record:
    """
    A record representing a biological sequence.

    Parameters
    ----------
    id : str
        The identifier of the sequence record.
    seq : str
        A string representing the sequence.

    Attributes
    ----------
    id : str
        The identifier of the sequence record. In a FASTA file, this is the
        string containing all characters (including whitespaces) after the
        leading '>' character. In a FASTQ file, this is the string containing
        all characters (including whitespaces) after the leading '@' character.
    seq : str
        A string representing the sequence.
    qual : str, optional
        A string representing the quality scores of the sequence. If the object
        represents a FASTA record, this attribute will be `None`.
    name : str
        The name of the sequence record. This is the string before the first
        whitespace character in the `id` attribute.
    description : str, optional
        The description of the sequence record. This is the string after the
        first whitespace character in the `id` attribute. If the `id` attribute
        contains no whitespace characters, this attribute will be `None`.

    Methods
    -------
    is_fasta
        Check if the object represents a FASTA record.
    is_fastq
        Check if the object represents a FASTQ record.
    normalize(iupac)
        Normalize the sequence stored in the `seq` attribute of the object.
    """

    # Declarations mirroring the "Attributes" section above, so that type
    # checkers and IDEs can resolve `record.id`, `record.seq`, and so on.
    # NOTE(review): declared as plain attributes; if the extension module
    # exposes them as read-only getters, turn these into `@property` stubs.
    # NOTE(review): the constructor described under "Parameters" is not
    # declared here; add `__init__` once its exact signature is confirmed.
    id: str
    seq: str
    qual: Union[str, None]
    name: str
    description: Union[str, None]

    def is_fasta(self) -> bool:
        """
        Check if the object represents a FASTA record.

        Returns
        -------
        bool
            `True` if the record lacks quality information, otherwise `False`.
        """
        ...

    def is_fastq(self) -> bool:
        """
        Check if the object represents a FASTQ record.

        Returns
        -------
        bool
            `True` if the record has quality information, otherwise `False`.
        """
        ...

    def normalize(self, iupac: bool) -> None:
        """
        Normalize the sequence stored in the `seq` attribute of the object.

        Parameters
        ----------
        iupac : bool
            IUPAC flag for the normalization. Presumably it has the same
            meaning as the `iupac` argument of `normalize_seq` (keep
            nucleotide ambiguity characters instead of replacing them with
            'N') -- confirm against the implementation.

        Returns
        -------
        None
            Nothing is returned; the result is stored in the `seq` attribute.

        See Also
        --------
        normalize_seq : A function to normalize nucleotide sequence strings.

        Notes
        -----
        The `normalize` method is designed for nucleotide sequences only. If
        used with protein sequences, it will incorrectly process amino acid
        characters as if they were nucleotides.
        """
        ...

def parse_fastx_file(path: Union[str, Path]) -> FastxReader:
    """
    Create an iterator over the sequence records of a FASTA/FASTQ file.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the FASTA/FASTQ file to parse.

    Returns
    -------
    FastxReader
        An iterator producing one `Record` object per sequence found in the
        input file.

    Raises
    ------
    NeedletailError
        If reading or parsing the input file fails.

    See Also
    --------
    parse_fastx_string : Parse sequence records from a FASTA/FASTQ string.
    FastxReader : Iterator class whose instances yield `Record` objects.
    """
    ...

def parse_fastx_string(fastx_string: str) -> FastxReader:
    """
    Returns an iterator that parses a FASTA/FASTQ string and yields sequence
    records.

    Parameters
    ----------
    fastx_string : str
        A string containing FASTA/FASTQ-formatted sequence records.

    Returns
    -------
    FastxReader
        A `FastxReader` iterator that yields `Record` objects representing
        sequences from the input string.

    Raises
    ------
    NeedletailError
        If an error occurs while parsing the input string.

    See Also
    --------
    parse_fastx_file : Parse sequence records from a FASTA/FASTQ file.
    FastxReader : Iterator class whose instances yield `Record` objects.
    """
    ...

def normalize_seq(seq: str, iupac: bool = False) -> str:
    """
    Normalize the sequence string of nucleotide records by:

    - Converting lowercase characters to uppercase.
    - Removing whitespace and newline characters.
    - Replacing 'U' with 'T'.
    - Replacing '.' and '~' with '-'.
    - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is `True`,
      in which case characters representing nucleotide ambiguity are not
      replaced.

    Parameters
    ----------
    seq : str
        A string representing a nucleotide sequence.
    iupac : bool, default: False
        If `True`, characters representing nucleotide ambiguity ('B', 'D',
        'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase
        forms) will not be converted to 'N'. Lowercase characters will still
        be converted to uppercase.

    Returns
    -------
    str
        The normalized sequence string.

    Notes
    -----
    The `normalize_seq` function is designed for nucleotide sequences only. If
    used with protein sequences, it will incorrectly process amino acid
    characters as if they were nucleotides.
    """
    # NOTE(review): the `= False` default was added so the signature agrees
    # with the documented default; confirm it matches the extension module.
    ...

def reverse_complement(seq: str) -> str:
    """
    Return the reverse complement of a nucleotide sequence.

    Parameters
    ----------
    seq : str
        The nucleotide sequence, given as a string.

    Returns
    -------
    str
        The input nucleotide sequence, reverse-complemented.

    Notes
    -----
    `reverse_complement` is intended for nucleotide sequences only. Given a
    protein sequence, it will wrongly treat the amino acid characters as
    nucleotides.
    """
    ...

def decode_phred(qual: str, base_64: bool = False) -> tuple[int, ...]:
    """
    Decode Phred quality strings to quality scores.

    Parameters
    ----------
    qual : str
        A string representing Phred-encoded quality strings.
    base_64 : bool, default=False
        If `True`, decode the quality using the Phred+64 encoding, otherwise
        the Phred+33 encoding will be used.

    Returns
    -------
    tuple of int
        A tuple of integers representing quality scores derived from the
        probability of a base-calling error using a logarithmic transformation.
    """
    # The return type is a variable-length tuple: `tuple[int]` would describe
    # a tuple holding exactly one integer.
    # NOTE(review): the `= False` default was added so the signature agrees
    # with the documented default; confirm it matches the extension module.
    ...
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
[build-system]
requires = ["maturin>=1.7,<2.0"]
requires = ["maturin>=1.8,<2.0"]
build-backend = "maturin"

[project]
name = "needletail"
requires-python = ">=3.8"
dynamic = ["version"]
classifiers = [
"Intended Audience :: Science/Research",
"Programming Language :: Python :: 3",
"Programming Language :: Rust",
"License :: OSI Approved :: MIT License",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
Expand Down
Loading
Loading