Changes from all commits
42 commits
035a0c7
dev: VEP chunk and VEP cache beegfs
migrau Apr 3, 2025
8ef2919
fix: use standard cache for ENSEMBLVEP_VEP
migrau May 8, 2025
40bb507
perf: improve VEP performance by converting input format
migrau May 14, 2025
bb21b25
fix: panel_postprocessing_annotation.py
migrau May 14, 2025
7c73d3b
fix: arguments safe_transform_context
migrau May 16, 2025
276152d
perf: chunking panel_custom_processing.py
migrau May 20, 2025
7bc3a16
perf: CREATECAPTUREDPANELS containers edited. create_panel_versions.p…
migrau May 22, 2025
346665d
fix: python3 container for CREATECAPTUREDPANELS
migrau Jun 4, 2025
08d8fad
fix: remove container option CREATECAPTUREDPANELS. fix conda versions…
migrau Jun 4, 2025
5c8ff55
fix: typo CREATECAPTUREDPANELS
migrau Jun 4, 2025
891ec85
fix: wave true only for CREATECAPTUREDPANELS
migrau Jun 4, 2025
e1fd6af
fix: syntax config module CREATECAPTUREDPANELS
migrau Jun 5, 2025
ca0ae01
fix: new way to specify wave for a single process
migrau Jun 5, 2025
5560c25
fix: toString added for wave
migrau Jun 5, 2025
c0c3e97
fix: wave label added
migrau Jun 5, 2025
24efcf6
fix: wave true for everything
migrau Jun 5, 2025
7734938
fix: wave false except CREATECAPTUREDPANELS
migrau Jun 5, 2025
b625332
fix: comma...
migrau Jun 5, 2025
8110a34
fix: wave removed. New container created
migrau Jun 5, 2025
e718e41
fix: Removed wave from nextflow.config
migrau Jun 6, 2025
9fd0ed7
fix: adjust memory requeriments
migrau Jun 30, 2025
abc85ed
perf: added new profile, nanoseq
migrau Jun 30, 2025
3e0b4b5
fix: naming withLabel config review
migrau Jul 1, 2025
61ec864
fix: nanoseq config resourceLimits
migrau Jul 1, 2025
0188172
fix: correct withName *
migrau Jul 1, 2025
b0e422a
fix: SITESFROMPOSITIONS memory test
migrau Jul 1, 2025
63dcea7
fix SITESFROMPOSITIONS
migrau Jul 1, 2025
7c2f56b
fix: SITESFROMPOSITIONS
migrau Jul 1, 2025
6e53f23
fix: fix profile
migrau Jul 1, 2025
e9d1b3b
fix: SITESFROMPOSITIONS config
migrau Jul 1, 2025
1dffd94
fix: POSTPROCESSVEPPANEL. Time
migrau Jul 2, 2025
24b170a
fix: RESOURCE LIMITS added
migrau Jul 3, 2025
d243ebc
fix: typo
migrau Jul 3, 2025
945c129
fix: update base.config
migrau Jul 3, 2025
198ff20
fix: adjust nanoconfig
migrau Jul 3, 2025
0cfd80f
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL
migrau Nov 14, 2025
6c64f4d
fix: parallelization optional. Include sort for bedtools merge
migrau Nov 14, 2025
b2f12fd
fix: gene omega error: "No flagged entries found; skipping plots and …
migrau Nov 16, 2025
d4ed3c2
fix: Add debug logging and ensure failing_consensus file is always cr…
migrau Nov 18, 2025
4be3b45
feat: Add chunking support for SITESFROMPOSITIONS with genomic sorting
migrau Nov 19, 2025
e52cb76
feat: add parallel_processing_parameters section to schema for chunki…
migrau Nov 19, 2025
92580ce
update dnds genes list
FerriolCalvet Nov 21, 2025
20 changes: 19 additions & 1 deletion bin/annotate_omega_failing.py
@@ -338,8 +338,26 @@ def main(omegas_file: str, compiled_flagged_files: str, output: str) -> None:
     lines = [ln.strip() for ln in fh if ln.strip()]
     flagged_paths = [Path(l) for l in lines]
 
+    # Read omegas with resilience to missing header lines
+    # Some aggregation steps may drop the header; if so, re-read with explicit names
+    def _read_omegas(path: Path) -> pd.DataFrame:
+        try:
+            df = pd.read_csv(path, sep="\t", header=0, dtype=str, skip_blank_lines=True)
+        except pd.errors.EmptyDataError:
+            return pd.DataFrame(columns=["gene", "sample", "impact", "mutations", "dnds", "pvalue", "lower", "upper"])  # empty
+        # If expected columns are missing (e.g., header was dropped), re-read with names
+        expected = {"gene", "sample", "impact", "mutations", "dnds", "pvalue", "lower", "upper"}
+        if not expected.issubset(set(map(str, df.columns))):
+            df = pd.read_csv(path,
+                             sep="\t",
+                             header=None,
+                             names=["gene", "sample", "impact", "mutations", "dnds", "pvalue", "lower", "upper"],
+                             dtype=str,
+                             skip_blank_lines=True)
+        return df.fillna("")
+
     # Read omegas
-    omegas = pd.read_csv(omegas_path, sep="\t", header=0, dtype=str).fillna("")
+    omegas = _read_omegas(omegas_path)
 
     syn_flagged, npa_flagged = load_flagged_tables(flagged_paths)
 
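Why the fallback matters: with header=0, pandas silently consumes the first data row as column names whenever an upstream concatenation step drops the header. A minimal sketch of the recovery path, with illustrative values (the column names match the `expected` set above):

    # Illustrative only: recovering a headerless omegas table, as _read_omegas does
    import pandas as pd
    from io import StringIO

    headerless = StringIO("TP53\tS1\tmissense\t12\t1.8\t0.01\t1.2\t2.7\n")
    df = pd.read_csv(headerless, sep="\t", header=None,
                     names=["gene", "sample", "impact", "mutations", "dnds", "pvalue", "lower", "upper"],
                     dtype=str, skip_blank_lines=True)
    print(df["gene"].tolist())  # ['TP53'] -- the row is read as data, not as a header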
10 changes: 10 additions & 0 deletions bin/create_consensus_panel.py
@@ -47,6 +47,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
     #####
     # Filter failing columns only for rows that pass the compliance threshold
     compliance_df_passing = compliance_df.filter(passing_rows)
+
+    print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
+    print(f"DEBUG: Number of samples: {compliance_df_passing.width}")
 
     # Invert all boolean values (True → False, False → True)
     failing_mask = pl.DataFrame([
@@ -64,6 +67,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
"Failed": True
})

print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")

if failing_columns_counts:
failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
@@ -73,6 +77,12 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
             .rename({"count": "FAILING_COUNT"})
         )
         failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
+        print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")
+    else:
+        # Create empty file with header for consistency
+        empty_df = pl.DataFrame({"SAMPLE_ID": [], "FAILING_COUNT": []}, schema={"SAMPLE_ID": pl.Utf8, "FAILING_COUNT": pl.Int64})
+        empty_df.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
+        print(f"DEBUG: No failures detected - created empty failing_consensus.{version}.tsv")
 
 
 @click.command()
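Always writing failing_consensus.{version}.tsv, even when no failures are found, gives downstream steps a stable contract: the file exists and the header is present. A minimal sketch of a downstream read under that assumption (the file name and consumer code are illustrative; passing `schema` pins the dtypes that Polars could not infer from zero rows):

    # Illustrative only: consuming the always-present failing_consensus file
    import polars as pl

    failing = pl.read_csv("failing_consensus.v1.tsv", separator="\t",
                          schema={"SAMPLE_ID": pl.Utf8, "FAILING_COUNT": pl.Int64})
    if failing.height == 0:
        print("No samples failed consensus filtering")
    else:
        print(failing.sort("FAILING_COUNT", descending=True).head())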
56 changes: 35 additions & 21 deletions bin/create_panel_versions.py
@@ -1,14 +1,20 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
-import click
-import pandas as pd
-import os
-
-# TODO: check pandas version 2.0.3
-# -- Auxiliary functions -- #
+"""
+create_panel_versions.py
+
+Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
+using the high-performance Polars library.
+
+Usage:
+    python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
+"""
+
+import polars as pl
+import click
+import sys
 
-panel_impact_dict = {
+PANEL_IMPACT_DICT = {
 
     "protein_affecting": ["nonsense", "missense",
                           "essential_splice",
@@ -68,25 +74,33 @@

 }
 
-# -- Main function -- #
 
-def create_panel_versions(compact_annot_panel_path, output_path):
+def create_panel_versions(input_path: str, output_prefix: str) -> None:
+    """
+    Generates panel subsets from a VEP-annotated file using Polars.
+
+    \b
+    INPUT_PATH: Path to the annotated TSV file.
+    OUTPUT_PREFIX: Prefix for the output files (e.g., 'output/panel').
+    """
+    try:
+        df = pl.read_csv(input_path, separator="\t")
+    except Exception as e:
+        click.echo(f"Error reading input file: {e}", err=True)
+        sys.exit(1)
 
-    # Load VEP annotated panel, already compacted to have one variant per site
-    ## requires column named IMPACT with consequence type
-    compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
+    if "IMPACT" not in df.columns:
+        click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
+        sys.exit(1)
 
     # Create panel versions
-    for version in panel_impact_dict:
-
-        panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
-        panel_version.to_csv(f"{output_path}.{version}.tsv",
-                             sep = "\t", index = False)
+    for version_name, impact_values in PANEL_IMPACT_DICT.items():
+        filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
+        filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")
 
-    # Store complete panel (better change this way of using this version in nextflow)
-    version = "all"
-    compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
-                                  sep = "\t", index = False)
+    # Write the full file as a version
+    df.write_csv(f"{output_prefix}.all.tsv", separator="\t")
+
+    click.echo("Panel versions generated successfully.")
 
 
 @click.command()
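The rewrite swaps pandas' `isin`-based row selection for Polars' `is_in` filter. A toy sketch of the filtering pattern, using the 'protein_affecting' labels from PANEL_IMPACT_DICT above (the frame and output file name are illustrative):

    # Illustrative only: the filter pattern used by create_panel_versions
    import polars as pl

    df = pl.DataFrame({
        "CHROM": ["chr1", "chr1", "chr2"],
        "POS": [100, 200, 300],
        "IMPACT": ["missense", "synonymous", "nonsense"],
    })
    protein_affecting = ["nonsense", "missense", "essential_splice"]
    subset = df.filter(pl.col("IMPACT").is_in(protein_affecting))
    subset.write_csv("panel.protein_affecting.tsv", separator="\t")  # keeps 2 of 3 rows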
7 changes: 5 additions & 2 deletions bin/dNdS_run.R
@@ -96,9 +96,12 @@ if (!is.null(opt$genelist)){

 # Loads the covs object
 load(opt$covariates)
+load(opt$referencetranscripts)
+
+reference_genes <- intersect(rownames(covs), unique(gr_genes$names))
 
 # Identify genes that are in 'genes' but not in the row names of 'covs'
-missing_genes <- setdiff(genes, rownames(covs))
+missing_genes <- setdiff(genes, reference_genes)
 
 # Print the missing genes, if any
 if (length(missing_genes) > 0) {
@@ -109,7 +112,7 @@ if (length(missing_genes) > 0) {
 }
 
 # Check that all the "requested" genes are in the covariates file
-genes <- intersect(rownames(covs), genes)
+genes <- intersect(reference_genes, genes)
 print("Keeping only the genes with in the covariates")
 
 
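The R change above intersects the covariates' gene names with the reference transcript names before computing `missing_genes`, so genes absent from either source are warned about and dropped rather than passed to the downstream dN/dS computation. A small sketch of that guard, written in Python for consistency with the other examples here (all names and gene symbols are illustrative):

    # Illustrative only: the intersection guard from dNdS_run.R
    covariate_genes = {"TP53", "KRAS", "EGFR"}         # rownames(covs)
    transcript_genes = {"TP53", "KRAS", "NOTCH1"}      # unique(gr_genes$names)
    requested = ["TP53", "KRAS", "MYC"]                # --genelist

    reference_genes = covariate_genes & transcript_genes
    missing = [g for g in requested if g not in reference_genes]
    if missing:
        print(f"Warning: genes absent from covariates/reference, dropped: {missing}")
    genes = [g for g in requested if g in reference_genes]  # ['TP53', 'KRAS']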
74 changes: 53 additions & 21 deletions bin/panel_custom_processing.py
@@ -17,29 +17,60 @@
 }
 
 
+def load_chr_data_chunked(filepath, chrom, chunksize=1_000_000):
+    """
+    Loads data for a specific chromosome from a large VEP output file in chunks.
+
+    Args:
+        filepath (str): Path to the VEP output file.
+        chrom (str): Chromosome to filter.
+        chunksize (int): Number of rows per chunk.
+
+    Returns:
+        pd.DataFrame: Filtered DataFrame for the chromosome.
+    """
+    reader = pd.read_csv(filepath, sep="\t", na_values=custom_na_values, chunksize=chunksize, dtype={'CHROM': str})
+    chr_data = []
+    for chunk in reader:
+        filtered = chunk[chunk["CHROM"] == chrom]
+        if not filtered.empty:
+            chr_data.append(filtered)
+    return pd.concat(chr_data) if chr_data else pd.DataFrame()
+
+
 def customize_panel_regions(VEP_output_file, custom_regions_file, customized_output_annotation_file,
-                            simple = True
+                            simple = True,
+                            chr_chunk_size = 1_000_000
                             ):
     """
-    # TODO
-    explain what this function does
+    Modifies annotations in a VEP output file based on custom genomic regions.
+
+    - For each region in the custom regions file, identifies the corresponding slice
+      in the VEP output.
+    - Updates gene names and impact values for the region.
+    - Saves both the modified annotation file and a record of added regions.
+
+    Args:
+        VEP_output_file (str): Path to the full VEP output file (TSV).
+        custom_regions_file (str): Custom region definitions (tab-delimited).
+        customized_output_annotation_file (str): Output file for updated annotations.
+        simple (bool): If True, outputs simplified annotations; else adds more fields.
     """
 
     # simple = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID' , 'GENE', 'IMPACT' , 'CONTEXT_MUT', 'CONTEXT']
     # rich = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID', 'STRAND', 'GENE', 'IMPACT', 'Feature', 'Protein_position', 'Amino_acids', 'CONTEXT_MUT', 'CONTEXT']
-    all_possible_sites = pd.read_csv(VEP_output_file, sep = "\t",
-                                     na_values = custom_na_values)
-    print("all possible sites loaded")
 
     custom_regions_df = pd.read_table(custom_regions_file)
 
     added_regions_df = pd.DataFrame()
 
     current_chr = ""
-    for ind, row in custom_regions_df.iterrows():
+    chr_data = pd.DataFrame()
+
+    for _, row in custom_regions_df.iterrows():
         try:
             if row["CHROM"] != current_chr:
                 current_chr = row["CHROM"]
-                chr_data = all_possible_sites[all_possible_sites["CHROM"] == current_chr]
+                chr_data = load_chr_data_chunked(VEP_output_file, current_chr, chunksize=chr_chunk_size)
 
                 print("Updating chromosome to:", current_chr)
 
             # Get start and end indices
Expand Down Expand Up @@ -88,25 +119,25 @@ def customize_panel_regions(VEP_output_file, custom_regions_file, customized_out

             ## Insert modified rows back into the df
             if simple:
-                all_possible_sites.loc[original_df_start: original_df_end, ["GENE", "IMPACT"]] = hotspot_data[["GENE", "IMPACT"]].values
+                chr_data.loc[original_df_start: original_df_end, ["GENE", "IMPACT"]] = hotspot_data[["GENE", "IMPACT"]].values
             else:
                 print("Getting Feature to '-'")
                 hotspot_data["Feature"] = '-'
-                all_possible_sites.loc[original_df_start: original_df_end, ["GENE", "IMPACT", "Feature"]] = hotspot_data[["GENE", "IMPACT", "Feature"]].values
+                chr_data.loc[original_df_start: original_df_end, ["GENE", "IMPACT", "Feature"]] = hotspot_data[["GENE", "IMPACT", "Feature"]].values
 
 
             added_regions_df = pd.concat((added_regions_df, hotspot_data))
             print("Small region added:", row["NAME"])
 
         except Exception as e:
             print(f"Error processing row {row}: {e}")
 
-    all_possible_sites = all_possible_sites.drop_duplicates(subset = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID',
-                                                                      'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
-                                                            keep = 'first')
-    all_possible_sites.to_csv(customized_output_annotation_file,
-                              header = True,
-                              index = False,
-                              sep = "\t")
+    chr_data = chr_data.drop_duplicates(
+        subset=['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID', 'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
+        keep='first'
+    )
+    chr_data.to_csv(customized_output_annotation_file, header=True, index=False, sep="\t")
 
     added_regions_df = added_regions_df.drop_duplicates(subset = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID',
                                                                   'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
@@ -123,8 +154,9 @@ def customize_panel_regions(VEP_output_file, custom_regions_file, customized_out
 @click.option('--custom-regions-file', required=True, type=click.Path(exists=True), help='Input custom regions file (TSV)')
 @click.option('--customized-output-annotation-file', required=True, type=click.Path(), help='Output annotation file (TSV)')
 @click.option('--simple', is_flag=True, help='Use simple annotation')
-def main(vep_output_file, custom_regions_file, customized_output_annotation_file, simple):
-    customize_panel_regions(vep_output_file, custom_regions_file, customized_output_annotation_file, simple)
+@click.option('--chr-chunk-size', type=int, default=1000000, show_default=True, help='Chunk size for per-chromosome loading')
+def main(vep_output_file, custom_regions_file, customized_output_annotation_file, simple, chr_chunk_size):
+    customize_panel_regions(vep_output_file, custom_regions_file, customized_output_annotation_file, simple, chr_chunk_size)
 
 if __name__ == '__main__':
     main()
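The new --chr-chunk-size flag exposes the chunked reader on the CLI. A usage sketch, under the assumption that the script's options mirror `main`'s parameters (the --vep-output-file flag sits just above the visible hunk; file names, chunk size, and the import path are hypothetical):

    # Illustrative only: equivalent of
    #   panel_custom_processing.py --vep-output-file vep_output.tsv \
    #       --custom-regions-file regions.tsv \
    #       --customized-output-annotation-file custom_annot.tsv \
    #       --simple --chr-chunk-size 500000
    from panel_custom_processing import customize_panel_regions  # hypothetical import

    customize_panel_regions(
        "vep_output.tsv",        # VEP panel, streamed per chromosome in 500k-row chunks
        "regions.tsv",           # custom region definitions
        "custom_annot.tsv",      # output with GENE/IMPACT overrides applied
        simple=True,
        chr_chunk_size=500_000,  # smaller chunks: lower peak memory, more I/O passes
    )

Smaller chunk sizes trade extra passes over the file for a lower peak-memory footprint, which is what the earlier memory-tuning commits in this PR were chasing.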