bbglab · migrau · Apr 3, 2025 · May 8, 2025 · May 14, 2025 · May 14, 2025
diff --git a/bin/annotate_omega_failing.py b/bin/annotate_omega_failing.py
@@ -338,8 +338,26 @@ def main(omegas_file: str, compiled_flagged_files: str, output: str) -> None:
         lines = [ln.strip() for ln in fh if ln.strip()]
     flagged_paths = [Path(l) for l in lines]
 
+    # Read omegas with resilience to missing header lines
+    # Some aggregation steps may drop the header; if so, re-read with explicit names
+    def _read_omegas(path: Path) -> pd.DataFrame:
+        """Load the omegas TSV as strings, coping with an empty file or a dropped header."""
+        # Expected schema; the order is the positional order used when the header is absent.
+        column_names = ["gene", "sample", "impact", "mutations", "dnds", "pvalue", "lower", "upper"]
+        common_opts = {"sep": "\t", "dtype": str, "skip_blank_lines": True}
+        try:
+            table = pd.read_csv(path, header=0, **common_opts)
+        except pd.errors.EmptyDataError:
+            # Nothing to parse at all: hand back an empty frame with the expected columns.
+            return pd.DataFrame(columns=column_names)
+        header_present = set(column_names) <= {str(col) for col in table.columns}
+        if not header_present:
+            # Re-read treating every line as data and assign the column names ourselves.
+            table = pd.read_csv(path, header=None, names=column_names, **common_opts)
+        return table.fillna("")
+
     # Read omegas
-    omegas = pd.read_csv(omegas_path, sep="\t", header=0, dtype=str).fillna("")
+    omegas = _read_omegas(omegas_path)
 
     syn_flagged, npa_flagged = load_flagged_tables(flagged_paths)
 

diff --git a/bin/create_consensus_panel.py b/bin/create_consensus_panel.py
@@ -47,6 +47,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
     #####
     # Filter failing columns only for rows that pass the compliance threshold
     compliance_df_passing = compliance_df.filter(passing_rows)
+
+    print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
+    print(f"DEBUG: Number of samples: {compliance_df_passing.width}")
 
     # Invert all boolean values (True → False, False → True)
     failing_mask = pl.DataFrame([
@@ -64,6 +67,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
                     "Failed": True
                 })
 
+    print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")
 
     if failing_columns_counts:
         failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
@@ -73,6 +77,12 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
             .rename({"count": "FAILING_COUNT"})
         )
         failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
+        print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")
+    else:
+        # Create empty file with header for consistency
+        empty_df = pl.DataFrame({"SAMPLE_ID": [], "FAILING_COUNT": []}, schema={"SAMPLE_ID": pl.Utf8, "FAILING_COUNT": pl.Int64})
+        empty_df.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
+        print(f"DEBUG: No failures detected - created empty failing_consensus.{version}.tsv")
 
 
 @click.command()

diff --git a/bin/create_panel_versions.py b/bin/create_panel_versions.py
@@ -1,14 +1,20 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
+"""
+create_panel_versions.py
 
-import click
-import pandas as pd
-import os
+Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
+using the high-performance Polars library.
+
+Usage:
+    python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
+"""
 
-# TODO: check pandas version 2.0.3
-# -- Auxiliary functions -- #
+import polars as pl
+import click
+import sys
 
-panel_impact_dict = {
+PANEL_IMPACT_DICT = {
 
     "protein_affecting": ["nonsense", "missense",
                             "essential_splice",
@@ -68,25 +74,33 @@
 
 }
 
-# -- Main function -- #
 
-def create_panel_versions(compact_annot_panel_path, output_path):
+def create_panel_versions(input_path: str, output_prefix: str) -> None:
+    """
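+    Generates panel subsets from a VEP-annotated file using Polars.
+
+    Writes one '{output_prefix}.{version}.tsv' per PANEL_IMPACT_DICT entry plus '{output_prefix}.all.tsv'.
+    input_path: Path to the annotated TSV file; it must contain an 'IMPACT' column.
+    output_prefix: Prefix for the output files (e.g., 'output/panel').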
+    """
+    try:
+        df = pl.read_csv(input_path, separator="\t")
+    except Exception as e:
+        click.echo(f"Error reading input file: {e}", err=True)
+        sys.exit(1)
 
-    # Load VEP annotated panel, already compacted to have one variant per site
-    ## requires column named IMPACT with consequence type
-    compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
+    if "IMPACT" not in df.columns:
+        click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
+        sys.exit(1)
 
-    # Create panel versions
-    for version in panel_impact_dict:
+    for version_name, impact_values in PANEL_IMPACT_DICT.items():
+        filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
+        filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")
 
-        panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
-        panel_version.to_csv(f"{output_path}.{version}.tsv",
-                                sep = "\t", index = False)
+    # Write the full file as a version
+    df.write_csv(f"{output_prefix}.all.tsv", separator="\t")
 
-    # Store complete panel (better change this way of using this version in nextflow)
-    version = "all"
-    compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
-                                    sep = "\t", index = False)
+    click.echo("Panel versions generated successfully.")
 
 
 @click.command()

diff --git a/bin/dNdS_run.R b/bin/dNdS_run.R
@@ -96,9 +96,12 @@ if (!is.null(opt$genelist)){
 
 # Loads the covs object
 load(opt$covariates)
+load(opt$referencetranscripts)
+
+reference_genes <- intersect(rownames(covs), unique(gr_genes$names))
 
 # Identify genes that are in 'genes' but not in the row names of 'covs'
-missing_genes <- setdiff(genes, rownames(covs))
+missing_genes <- setdiff(genes, reference_genes)
 
 # Print the missing genes, if any
 if (length(missing_genes) > 0) {
@@ -109,7 +112,7 @@ if (length(missing_genes) > 0) {
 }
 
 # Check that all the "requested" genes are in the covariates file
-genes <- intersect(rownames(covs), genes)
+genes <- intersect(reference_genes, genes)
 print("Keeping only the genes with in the covariates")
 
 

diff --git a/bin/panel_custom_processing.py b/bin/panel_custom_processing.py
@@ -17,29 +17,60 @@
             }
 
 
+def load_chr_data_chunked(filepath, chrom, chunksize=1_000_000):
+    """
+    Loads the rows of one chromosome from a large VEP output file, reading it in chunks.
+
+    Args:
+        filepath (str): Path to the VEP output file.
+        chrom (str): Chromosome to filter; cast to str before comparing with 'CHROM'.
+        chunksize (int): Number of rows per chunk.
+
+    Returns:
+        pd.DataFrame: Rows for the chromosome; empty but with the file's columns if none match.
+    """
+    reader = pd.read_csv(filepath, sep="\t", na_values=custom_na_values, chunksize=chunksize, dtype={'CHROM': str})
+    # 'CHROM' is read as str, so cast the query too (a numeric-only regions file yields ints).
+    parts = [chunk[chunk["CHROM"] == str(chrom)] for chunk in reader]
+    kept = [part for part in parts if not part.empty]
+    if kept:
+        return pd.concat(kept)
+    return parts[0] if parts else pd.DataFrame()  # no match: empty frame, columns kept
+
+
 def customize_panel_regions(VEP_output_file, custom_regions_file, customized_output_annotation_file,
-                            simple = True
+                            simple = True,
+                            chr_chunk_size = 1_000_000
                             ):
     """
-    # TODO
-    explain what this function does
+    Modifies annotations in a VEP output file based on custom genomic regions.
+
+    - For each custom region, finds its slice in the VEP output and updates gene names and impact values.
+    - Saves both the modified annotation file and a record of added regions.
+    - NOTE(review): the annotation written at the end seems to contain only the last chromosome loaded; confirm.
+
+    Args:
+        VEP_output_file (str): Path to the full VEP output file (TSV).
+        custom_regions_file (str): Custom region definitions (tab-delimited).
+        customized_output_annotation_file (str): Output file for updated annotations.
+        simple (bool): If True, only GENE and IMPACT are updated; otherwise Feature is also set to '-'.
+        chr_chunk_size (int): Rows per chunk when loading one chromosome of the VEP file.
     """
+
     # simple = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID'          , 'GENE', 'IMPACT'                                              , 'CONTEXT_MUT', 'CONTEXT']
     # rich   = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID', 'STRAND', 'GENE', 'IMPACT', 'Feature', 'Protein_position', 'Amino_acids', 'CONTEXT_MUT', 'CONTEXT']
-    all_possible_sites = pd.read_csv(VEP_output_file, sep = "\t",
-                                        na_values = custom_na_values)
-    print("all possible sites loaded")
 
     custom_regions_df = pd.read_table(custom_regions_file)
-
     added_regions_df = pd.DataFrame()
-
     current_chr = ""
-    for ind, row in custom_regions_df.iterrows():
+    chr_data = pd.DataFrame()
+
+    for _, row in custom_regions_df.iterrows():
         try:
             if row["CHROM"] != current_chr:
                 current_chr = row["CHROM"]
-                chr_data = all_possible_sites[all_possible_sites["CHROM"] == current_chr]
+                chr_data = load_chr_data_chunked(VEP_output_file, current_chr, chunksize=chr_chunk_size)
+
                 print("Updating chromosome to:", current_chr)
 
             # Get start and end indices
@@ -88,25 +119,25 @@ def customize_panel_regions(VEP_output_file, custom_regions_file, customized_out
 
             ## Insert modified rows back into the df
             if simple:
-                all_possible_sites.loc[original_df_start: original_df_end, ["GENE", "IMPACT"]] = hotspot_data[["GENE", "IMPACT"]].values
+                chr_data.loc[original_df_start: original_df_end, ["GENE", "IMPACT"]] = hotspot_data[["GENE", "IMPACT"]].values
             else:
                 print("Getting Feature to '-'")
                 hotspot_data["Feature"] = '-'
-                all_possible_sites.loc[original_df_start: original_df_end, ["GENE", "IMPACT", "Feature"]] = hotspot_data[["GENE", "IMPACT", "Feature"]].values
+                chr_data.loc[original_df_start: original_df_end, ["GENE", "IMPACT", "Feature"]] = hotspot_data[["GENE", "IMPACT", "Feature"]].values
+
 
             added_regions_df = pd.concat((added_regions_df, hotspot_data))
             print("Small region added:", row["NAME"])
 
         except Exception as e:
             print(f"Error processing row {row}: {e}")
 
-    all_possible_sites = all_possible_sites.drop_duplicates(subset = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID',
-                                                                    'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
-                                                            keep = 'first')
-    all_possible_sites.to_csv(customized_output_annotation_file,
-                                        header = True,
-                                        index = False,
-                                        sep = "\t")
+    chr_data = chr_data.drop_duplicates(
+        subset=['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID', 'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
+        keep='first'
+    )
+    chr_data.to_csv(customized_output_annotation_file, header=True, index=False, sep="\t")
+
 
     added_regions_df = added_regions_df.drop_duplicates(subset = ['CHROM', 'POS', 'REF', 'ALT', 'MUT_ID',
                                                                     'GENE', 'CONTEXT_MUT', 'CONTEXT', 'IMPACT'],
@@ -123,8 +154,9 @@ def customize_panel_regions(VEP_output_file, custom_regions_file, customized_out
 @click.option('--custom-regions-file', required=True, type=click.Path(exists=True), help='Input custom regions file (TSV)')
 @click.option('--customized-output-annotation-file', required=True, type=click.Path(), help='Output annotation file (TSV)')
 @click.option('--simple', is_flag=True, help='Use simple annotation')
-def main(vep_output_file, custom_regions_file, customized_output_annotation_file, simple):
-    customize_panel_regions(vep_output_file, custom_regions_file, customized_output_annotation_file, simple)
+@click.option('--chr-chunk-size', type=int, default=1000000, show_default=True, help='Chunk size for per-chromosome loading')
+def main(vep_output_file, custom_regions_file, customized_output_annotation_file, simple, chr_chunk_size):
+    customize_panel_regions(vep_output_file, custom_regions_file, customized_output_annotation_file, simple, chr_chunk_size)
 
 if __name__ == '__main__':
     main()