diff --git a/docs/examples/plugin_examples.rst b/docs/examples/plugin_examples.rst index 9e6af80..c387cf1 100644 --- a/docs/examples/plugin_examples.rst +++ b/docs/examples/plugin_examples.rst @@ -3,12 +3,18 @@ Plugin examples =============================== + + **OpenVariant** offers a plugin system, where the user will be able to build their own plugins and make a customized data transformation. First of all, you will need to create a plugin; hence, check :ref:`Command-line interface` section and :ref:`Command-line interface examples` to understand how a plugin template can be generated. Also, it is important to know how plugins works and how they are composed in order to understand the following examples that we introduce. -We are going to introduce you two little plugins that we will use them in the example. The two plugins are described and built as: +Unique field plugin +---------------------- + +Plugins can modify individual fields, and in this example, we introduce two small plugins that are described and +implemented as follows: *Add date* plugin ######################## @@ -93,7 +99,188 @@ extract the length between the two fields. return context.row[context.field_name] -These two plugins are used in the following example: +Multiple fields plugin +------------------------- + +The plugin system allows transforming multiple fields simultaneously, and can be constructed as follows: + +*HGVS decoder* plugin +####################### + +`The Human Genome Variation Society (HGVS) Nomenclature <https://hgvs-nomenclature.org/>`_ is the global standard +for describing DNA, RNA, and protein sequence variants. It is widely used in clinical reports, scientific publications, +and variant databases to communicate genetic changes. HGVS variants are expressed using a specific syntax that encodes +detailed information about the type and location of the change (e.g. `c.76A>T`, `r.76_78del`, `p.Gly76_Val78del`). 
+ +In this plugin, we decode HGVS expressions by identifying and separating the variant type (*TYPE*), its position (*POSITION*), +and the specific change that occurs (*VARIANT*). + +The *annotation* file with multiple fields can be described as: + +.. code-block:: yaml + + columns: + - TYPE + - POSITION + - VARIANT + + annotation: + - type: plugin + plugin: HGVS_decoder + field: + - TYPE + - POSITION + - VARIANT + - type: internal + field: HGVS + fieldSource: + - 'HGVS Consequence' + - HGVSp + + +We built the plugin with attention to the order of the different fields it processes. + +.. code-block:: python + + from openvariant.plugins.context import Context + from openvariant.plugins.plugin import Plugin + + import re + + class HGVS_decoderContext(Context): + + def __init__(self, row: dict, field_name: str, file_path: str) -> None: + super().__init__(row, field_name, file_path) + + + amino_acids_map = { + "Ala": "Alanine", + "Arg": "Arginine", + "Asn": "Asparagine", + "Asp": "Aspartic Acid", + "Cys": "Cysteine", + "Gln": "Glutamine", + "Glu": "Glutamic Acid", + "Gly": "Glycine", + "His": "Histidine", + "Ile": "Isoleucine", + "Leu": "Leucine", + "Lys": "Lysine", + "Met": "Methionine", + "Phe": "Phenylalanine", + "Pro": "Proline", + "Ser": "Serine", + "Thr": "Threonine", + "Trp": "Tryptophan", + "Tyr": "Tyrosine", + "Val": "Valine", + "Ter": "Termination codon" + } + + variant_map = { + "delins": "deletion-insertion by ", + "del": "deletion", + "ins": "insertion of ", + "dup": "duplication", + "inv": "inversion", + "con": "conversion", + "ext": "extension of ", + "fs": "frameshift mutation of " + } + + position_regex = re.compile(r'(\(?\*?-?\??\_?\d+(?:\_?[+-]\d+\??)?\)?(_)?(?:\(?\*?-?\d+\_?(?:[+-]\d+)?\??\)?)?)') + protein_position_regex = re.compile(r'(?[ACTG]+|del|ins[ACTG]+|dup|inv|con|\[[0-9]+\]|delins[ACTG]+') + variant_rna_regex = re.compile(r'[agcu]+>[agcu]+|del|ins[agcu]+|dup|inv|con|\[[0-9]+\]|delins[agcu]+') + + amino_acids = 
r'(?:Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter)' + variant_protein_aa_regex = re.compile(rf'(? 0: + variant = variant_map.get(matches_variant[0]) + matches_n = re.findall(nucleotides, matches[0]) + if len(matches_n) > 0: + variant += matches_n[0] + else: + variant = matches[0] + return variant + + def parse_hgvs_variant_protein(hgvs_str): + matches = re.findall(variant_protein_aa_regex, hgvs_str) + if len(matches) == 1: + variant = amino_acids_map.get(matches[0]) + else: + aa_1 = amino_acids_map.get(matches[0]) + aa_2 = amino_acids_map.get(matches[1]) + if aa_1 == aa_2: + variant = "Synonymous (silent) variant" + else: + variant = aa_1 + " mutated to " + aa_2 + matches = re.findall(variant_protein_mod_regex, hgvs_str) + if len(matches) > 0: + variant += " and " + matches_variant = re.findall(variant_type_regex, matches[0]) + variant += variant_map.get(matches_variant[0]) + matches_amino_acid = re.findall(amino_acids, matches[0]) + if len(matches_amino_acid) > 0: + variant += amino_acids_map.get(matches_amino_acid[0]) + return variant + + def interpret_hgvs(hgvs_str): + prefix_map = { + "g.": ("gDNA", parse_hgvs_pos, parse_hgvs_variant), + "c.": ("cDNA", parse_hgvs_pos, parse_hgvs_variant), + "n.": ("ncDNA", parse_hgvs_pos, parse_hgvs_variant), + "m.": ("mtDNA", parse_hgvs_pos, parse_hgvs_variant), + "r.": ("RNA", parse_hgvs_pos, parse_hgvs_variant), + "p.": ("Protein", parse_hgvs_pos_protein, parse_hgvs_variant_protein), + } + + prefix = hgvs_str[:2] + + result = prefix_map.get(prefix, ("Unknown", lambda _: None, lambda _: None)) + seq = hgvs_str[2:] + + type_variant = result[0] + position = result[1](seq) + variant = result[2](seq) + + return type_variant, position, variant + + + + class HGVS_decoderPlugin(Plugin): + + def run(self, context: HGVS_decoderContext) -> dict: + + value = context.row["HGVS"] + type_variant, position, variant = interpret_hgvs(value) + + return type_variant, position, variant + + + +We can find all the examples on the 
repository: `OpenVariant examples `_ +and these plugins are used in the following examples: .. nbgallery:: :name: Plugin System examples @@ -101,4 +288,3 @@ These two plugins are used in the following example: plugin_system/plugin_system.ipynb -We can find all the examples on the repository: `OpenVariant examples `_. \ No newline at end of file diff --git a/docs/examples/plugin_system/HGVS_decoder/HGVS_decoder.py b/docs/examples/plugin_system/HGVS_decoder/HGVS_decoder.py new file mode 100644 index 0000000..b23c87a --- /dev/null +++ b/docs/examples/plugin_system/HGVS_decoder/HGVS_decoder.py @@ -0,0 +1,137 @@ +from openvariant.plugins.context import Context +from openvariant.plugins.plugin import Plugin + +import re + +class HGVS_decoderContext(Context): + + def __init__(self, row: dict, field_name: str, file_path: str) -> None: + super().__init__(row, field_name, file_path) + + +amino_acids_map = { + "Ala": "Alanine", + "Arg": "Arginine", + "Asn": "Asparagine", + "Asp": "Aspartic Acid", + "Cys": "Cysteine", + "Gln": "Glutamine", + "Glu": "Glutamic Acid", + "Gly": "Glycine", + "His": "Histidine", + "Ile": "Isoleucine", + "Leu": "Leucine", + "Lys": "Lysine", + "Met": "Methionine", + "Phe": "Phenylalanine", + "Pro": "Proline", + "Ser": "Serine", + "Thr": "Threonine", + "Trp": "Tryptophan", + "Tyr": "Tyrosine", + "Val": "Valine", + "Ter": "Termination codon" +} + +variant_map = { + "delins": "deletion-insertion by ", + "del": "deletion", + "ins": "insertion of ", + "dup": "duplication", + "inv": "inversion", + "con": "conversion", + "ext": "extension of ", + "fs": "frameshift mutation of " +} + +position_regex = re.compile(r'(\(?\*?-?\??\_?\d+(?:\_?[+-]\d+\??)?\)?(_)?(?:\(?\*?-?\d+\_?(?:[+-]\d+)?\??\)?)?)') +protein_position_regex = re.compile(r'(?[ACTG]+|del|ins[ACTG]+|dup|inv|con|\[[0-9]+\]|delins[ACTG]+') +variant_rna_regex = re.compile(r'[agcu]+>[agcu]+|del|ins[agcu]+|dup|inv|con|\[[0-9]+\]|delins[agcu]+') + +amino_acids = 
r'(?:Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter)' +variant_protein_aa_regex = re.compile(rf'(? 0: + variant = variant_map.get(matches_variant[0]) + matches_n = re.findall(nucleotides, matches[0]) + if len(matches_n) > 0: + variant += matches_n[0] + else: + variant = matches[0] + return variant + +def parse_hgvs_variant_protein(hgvs_str): + matches = re.findall(variant_protein_aa_regex, hgvs_str) + if len(matches) == 1: + variant = amino_acids_map.get(matches[0]) + else: + aa_1 = amino_acids_map.get(matches[0]) + aa_2 = amino_acids_map.get(matches[1]) + if aa_1 == aa_2: + variant = "Synonymous (silent) variant" + else: + variant = aa_1 + " mutated to " + aa_2 + matches = re.findall(variant_protein_mod_regex, hgvs_str) + if len(matches) > 0: + variant += " and " + matches_variant = re.findall(variant_type_regex, matches[0]) + variant += variant_map.get(matches_variant[0]) + matches_amino_acid = re.findall(amino_acids, matches[0]) + if len(matches_amino_acid) > 0: + variant += amino_acids_map.get(matches_amino_acid[0]) + return variant + +def parse_hgvs_unknow(hgvs_str): + return None + +def interpret_hgvs(hgvs_str): + prefix_map = { + "g.": ("gDNA", parse_hgvs_pos, parse_hgvs_variant), + "c.": ("cDNA", parse_hgvs_pos, parse_hgvs_variant), + "n.": ("ncDNA", parse_hgvs_pos, parse_hgvs_variant), + "m.": ("mtDNA", parse_hgvs_pos, parse_hgvs_variant), + "r.": ("RNA", parse_hgvs_pos, parse_hgvs_variant), + "p.": ("Protein", parse_hgvs_pos_protein, parse_hgvs_variant_protein), + } + + prefix = hgvs_str[:2] + + result = prefix_map.get(prefix, ("Unknown", parse_hgvs_unknow, parse_hgvs_unknow)) + seq = hgvs_str[2:] + + type_variant = result[0] + position = result[1](seq) + variant = result[2](seq) + + return type_variant, position, variant + + + +class HGVS_decoderPlugin(Plugin): + + def run(self, context: HGVS_decoderContext) -> dict: + + value = context.row["HGVS"] + type_variant, position, variant = interpret_hgvs(value) + + return 
type_variant, position, variant diff --git a/docs/examples/plugin_system/HGVS_decoder/__init__.py b/docs/examples/plugin_system/HGVS_decoder/__init__.py new file mode 100644 index 0000000..a9b2ec9 --- /dev/null +++ b/docs/examples/plugin_system/HGVS_decoder/__init__.py @@ -0,0 +1,2 @@ +from .HGVS_decoder import HGVS_decoderPlugin +from .HGVS_decoder import HGVS_decoderContext diff --git a/docs/examples/plugin_system/plugin_system.ipynb b/docs/examples/plugin_system/plugin_system.ipynb index 8469a7c..5a741d6 100644 --- a/docs/examples/plugin_system/plugin_system.ipynb +++ b/docs/examples/plugin_system/plugin_system.ipynb @@ -2,19 +2,40 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# Plugin system example" - ], "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "# Plugin system example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unique field plugin\n", + "\n", + "Includes the Add Date plugin and the Get Length plugin." 
+ ] }, { "cell_type": "code", "execution_count": 1, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -41,34 +62,74 @@ "source": [ "%%bash\n", "openvar cat ../datasets/sample3 --header" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multiple fields plugin\n", + "\n", + "Decoding HGVS across different variants" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TYPE\tPOSITION\tVARIANT\n", + "cDNA\t-33-42\tG>T\n", + "cDNA\t-33-42\tG>C\n", + "cDNA\t-33-42\tG>A\n", + "cDNA\t-33-39\tT>C\n", + "cDNA\t-33-37\tC>A\n", + "cDNA\t-33-36\tC>T\n", + "cDNA\t-33-34\tA>G\n", + "cDNA\t-33-33\tG>A\n", + "cDNA\t-33-30\tT>C\n", + "cDNA\t-33-28\tC>A\n", + "cDNA\t-33-27\tT>C\n", + "cDNA\t-33-25\tduplication\n", + "cDNA\t-33-24\tG>C\n", + "cDNA\t-33-24\tG>A\n", + "cDNA\t-33-23\tG>A\n", + "cDNA\t-33-21\tG>A\n", + "cDNA\t-33-20\tT>A\n", + "cDNA\t-33-19\tC>G\n", + "cDNA\t-33-19\tC>A\n" + ] } - } + ], + "source": [ + "%%bash\n", + "openvar cat ../datasets/sample4 --header" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.13.2" } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 4 +} diff --git a/docs/user_guide/annotation_structure.rst b/docs/user_guide/annotation_structure.rst index c99e3ed..6caa278 100644 --- 
a/docs/user_guide/annotation_structure.rst +++ b/docs/user_guide/annotation_structure.rst @@ -228,12 +228,12 @@ Plugin ############# It will apply the plugin functionality to each row of the `input` file. The plugin can be internal, located into `plugin` -folder or can be customized and created by the user. See further details in :ref:`Plugin system` section. +folder or can be customized and created by the user. The parameters that `Plugin` needs are: * ``type``: type of annotation. (required) -* ``field``: name that will appear as a head column of this annotation. (required) +* ``field``: a single name or a list of fields that will appear as a head column of this annotation. (required) * ``plugin``: name of plugin to apply (required) .. code-block:: yaml @@ -243,6 +243,22 @@ The parameters that `Plugin` needs are: field: 'ALT_TYPE' plugin: 'alteration_type' +The plugin system supports multiple fields; however, the number and order of the fields listed in the annotation must match the +values returned by the plugin implementation. + +.. code-block:: yaml + + # Example: + - type: 'plugin' + field: + - 'Chr' + - 'Start' + - 'End' + - 'Alt' + - 'Ref' + plugin: 'variant_decoder' + +See further details in :ref:`Plugin system` section. Exclude (optional) ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/user_guide/plugin_system.rst b/docs/user_guide/plugin_system.rst index 572caae..b81c8bb 100644 --- a/docs/user_guide/plugin_system.rst +++ b/docs/user_guide/plugin_system.rst @@ -20,7 +20,7 @@ visualize how the different classes are connected and composed with **OpenVarian | As we have mentioned before, the plugin has to be present on the `annotation` file in order to be used. Custom plugins will be placed in the -folder where the environment variable :bash:`OPENVAR_PLUGIN` points (:bash:`/home/user/.local/share/openvariant/` by default). +folder where the environment variable ``OPENVAR_PLUGIN`` points (``/home/user/.local/share/openvariant/`` by default). 
The `Builder` will manage to find them and apply the data transformation. Plugins will inherit `Context` and `Plugin` as base classes for each plugin. These classes are described as it follows: @@ -62,5 +62,19 @@ Plugins will inherit `Context` and `Plugin` as base classes for each plugin. The """ raise NotImplementedError +A plugin can return either a single field or multiple fields. Both cases are handled as follows: + +*Returning a single field:* + +.. code-block:: python + + return position + +*Returning multiple fields:* + +.. code-block:: python + + return chromosome, start, end, alt, ref + Check :ref:`Command-line interface` to know how to create a new plugin. Also, to check more examples on how plugins can be applied and written, see :ref:`Plugin examples`. \ No newline at end of file diff --git a/examples/datasets/sample4/gnomAD.csv b/examples/datasets/sample4/gnomAD.csv new file mode 100644 index 0000000..73b6f66 --- /dev/null +++ b/examples/datasets/sample4/gnomAD.csv @@ -0,0 +1,20 @@ +gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count 
European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining +1-3682291-G-T,1,3682291,,G,T,PASS,NA,ENST00000378295.9,c.-33-42G>T,,c.-33-42G>T,intron_variant,,,,2,1375376,1.45414781121671E-06,0,0,PASS,sas,0.00000531,0.482,,0,0,-0.056,,,0,67716,0,0,0,37548,0,0,0,21442,0,0,0,35262,0,0,0,54404,0,0,0,5224,0,0,0,1038424,0,0,0,912,0,0,2,62504,0,0,0,51940,0,0 +1-3682291-G-C,1,3682291,rs534349566,G,C,PASS,PASS,ENST00000378295.9,c.-33-42G>C,,c.-33-42G>C,intron_variant,,,,59,1375374,4.28974228100866E-05,1,0,PASS,amr,0.0012026,0.419,,0,0,-0.056,,,1,67716,0,0,57,37546,1,0,0,21442,0,0,0,35262,0,0,0,54404,0,0,0,5224,0,0,0,1038424,0,0,0,912,0,0,0,62504,0,0,1,51940,0,0 +1-3682291-G-A,1,3682291,rs534349566,G,A,PASS,PASS,ENST00000378295.9,c.-33-42G>A,,c.-33-42G>A,intron_variant,,,,23,1375258,1.6724134671458E-05,0,0,PASS,nfe,0.0000117,0.644,,0.01,0,-0.056,,,0,67594,0,0,0,37528,0,0,0,21442,0,0,1,35274,0,0,1,54404,0,0,0,5246,0,0,19,1038432,0,0,0,912,0,0,0,62508,0,0,2,51918,0,0 +1-3682294-T-C,1,3682294,rs916680140,T,C,PASS,NA,ENST00000378295.9,c.-33-39T>C,,c.-33-39T>C,intron_variant,,,,2,1383912,1.44517859517079E-06,0,0,PASS,nfe,0.00000032,0.067,,0,0,-0.017,,,0,67926,0,0,0,38244,0,0,0,21612,0,0,0,35122,0,0,0,54618,0,0,0,5294,0,0,2,1044616,0,0,0,910,0,0,0,63312,0,0,0,52258,0,0 
+1-3682296-C-A,1,3682296,,C,A,PASS,NA,ENST00000378295.9,c.-33-37C>A,,c.-33-37C>A,intron_variant,,,,1,1395094,7.16797577797625E-07,0,0,PASS,,,0.693,,0.03,0,-0.247,,,0,68312,0,0,0,38864,0,0,0,21976,0,0,0,35740,0,0,0,54858,0,0,0,5336,0,0,0,1051480,0,0,0,912,0,0,1,64836,0,0,0,52780,0,0 +1-3682297-C-T,1,3682297,,C,T,PASS,NA,ENST00000378295.9,c.-33-36C>T,,c.-33-36C>T,intron_variant,,,,1,1397234,7.1569973247144E-07,0,0,PASS,,,2.31,,0.02,0,-0.027,,,0,68350,0,0,1,38954,0,0,0,22022,0,0,0,35768,0,0,0,54912,0,0,0,5346,0,0,0,1053160,0,0,0,912,0,0,0,64990,0,0,0,52820,0,0 +1-3682299-A-G,1,3682299,,A,G,PASS,NA,ENST00000378295.9,c.-33-34A>G,,c.-33-34A>G,intron_variant,,,,3,1395420,2.14989035559187E-06,0,0,PASS,nfe,0.00000032,0.509,,0,-0.01,-3.89,,,0,68240,0,0,0,38698,0,0,0,21962,0,0,0,35692,0,0,0,54938,0,0,0,5348,0,0,2,1052118,0,0,0,910,0,0,1,64786,0,0,0,52728,0,0 +1-3682300-G-A,1,3682300,,G,A,PASS,NA,ENST00000378295.9,c.-33-33G>A,,c.-33-33G>A,intron_variant,,,,1,1400418,7.14072512635513E-07,0,0,PASS,,,0.078,,0.01,0,-0.611,,,0,68452,0,0,0,39038,0,0,0,22120,0,0,0,35804,0,0,0,55004,0,0,0,5354,0,0,1,1055368,0,0,0,910,0,0,0,65406,0,0,0,52962,0,0 +1-3682303-T-C,1,3682303,,T,C,PASS,NA,ENST00000378295.9,c.-33-30T>C,,c.-33-30T>C,intron_variant,,,,1,1411590,7.08421000432137E-07,0,0,PASS,,,0.334,,0,-0.01,-2.55,,,0,68850,0,0,0,40442,0,0,0,22552,0,0,0,35996,0,0,0,55292,0,0,0,5408,0,0,0,1062338,0,0,0,912,0,0,0,66380,0,0,1,53420,0,0 +1-3682305-C-A,1,3682305,,C,A,PASS,NA,ENST00000378295.9,c.-33-28C>A,,c.-33-28C>A,intron_variant,,,,1,1429328,6.99629476229389E-07,0,0,PASS,,,6.93,,0,0,2.11,,,0,69494,0,0,0,42188,0,0,0,23198,0,0,0,36648,0,0,0,55812,0,0,0,5478,0,0,0,1072304,0,0,0,912,0,0,0,68996,0,0,1,54298,0,0 
+1-3682306-T-C,1,3682306,rs374235190,T,C,PASS,PASS,ENST00000378295.9,c.-33-27T>C,,c.-33-27T>C,intron_variant,,,,36,1429814,2.5178100088543E-05,0,0,PASS,sas,0.00037441,4.67,,0.02,-0.07,0.436,,,0,69468,0,0,0,42094,0,0,0,23190,0,0,0,36616,0,0,0,55790,0,0,0,5480,0,0,0,1073056,0,0,0,912,0,0,35,68932,0,0,1,54276,0,0 +1-3682307-C-CA,1,3682307,,C,CA,PASS,NA,ENST00000378295.9,c.-33-25dup,,c.-33-25dup,intron_variant,,,,1,1431932,6.98357184559043E-07,0,0,PASS,,,6.6,,0,0,2.41,,,0,69544,0,0,0,42206,0,0,0,23256,0,0,0,36706,0,0,0,55852,0,0,0,5492,0,0,1,1074286,0,0,0,912,0,0,0,69304,0,0,0,54374,0,0 +1-3682309-G-C,1,3682309,,G,C,PASS,NA,ENST00000378295.9,c.-33-24G>C,,c.-33-24G>C,intron_variant,,,,1,1442640,6.93173626129873E-07,0,0,PASS,,,0.693,,0,0,-3.09,,,0,69918,0,0,0,43250,0,0,0,23634,0,0,0,37102,0,0,0,56160,0,0,0,5522,0,0,0,1080638,0,0,0,912,0,0,0,70644,0,0,1,54860,0,0 +1-3682309-G-A,1,3682309,,G,A,PASS,NA,ENST00000378295.9,c.-33-24G>A,,c.-33-24G>A,intron_variant,,,,6,1442640,4.15904175677924E-06,0,0,PASS,nfe,0.000002,0.825,,0,0,-3.09,,,0,69918,0,0,0,43250,0,0,0,23634,0,0,0,37102,0,0,0,56160,0,0,0,5522,0,0,6,1080638,0,0,0,912,0,0,0,70644,0,0,0,54860,0,0 +1-3682310-G-A,1,3682310,rs1449761755,G,A,PASS,NA,ENST00000378295.9,c.-33-23G>A,,c.-33-23G>A,intron_variant,,,,1,1445388,6.91855750843372E-07,0,0,PASS,,,0.247,,0,0,-0.758,,,0,70038,0,0,1,43458,0,0,0,23788,0,0,0,37182,0,0,0,56218,0,0,0,5536,0,0,0,1082228,0,0,0,912,0,0,0,71036,0,0,0,54992,0,0 +1-3682312-G-A,1,3682312,,G,A,PASS,NA,ENST00000378295.9,c.-33-21G>A,,c.-33-21G>A,intron_variant,,,,5,1447908,3.45325807993326E-06,0,0,PASS,nfe,0.00000086,4.77,,0.03,0.01,0.468,,,0,70096,0,0,0,43558,0,0,0,23852,0,0,0,37252,0,0,0,56312,0,0,0,5542,0,0,4,1083938,0,0,0,912,0,0,1,71346,0,0,0,55100,0,0 
+1-3682313-T-A,1,3682313,,T,A,PASS,NA,ENST00000378295.9,c.-33-20T>A,,c.-33-20T>A,intron_variant,,,,1,1446084,6.91522760780148E-07,0,0,PASS,,,3.91,,0,0,-0.524,,,1,69952,0,0,0,43328,0,0,0,23842,0,0,0,37070,0,0,0,56288,0,0,0,5542,0,0,0,1082966,0,0,0,910,0,0,0,71170,0,0,0,55016,0,0 +1-3682314-C-G,1,3682314,rs554511962,C,G,PASS,NA,ENST00000378295.9,c.-33-19C>G,,c.-33-19C>G,intron_variant,,,,6,1451188,4.13454356017277E-06,0,0,PASS,nfe,0.00000199,0.322,,0,0,0.42,,,0,70186,0,0,0,43810,0,0,0,24036,0,0,0,37346,0,0,0,56458,0,0,0,5552,0,0,6,1085908,0,0,0,912,0,0,0,71728,0,0,0,55252,0,0 +1-3682314-C-A,1,3682314,rs554511962,C,A,PASS,PASS,ENST00000378295.9,c.-33-19C>A,,c.-33-19C>A,intron_variant,,,,23,1451186,1.5849105490268E-05,0,0,PASS,eas,0.00037641,0.292,,0,0,0.42,,,0,70186,0,0,0,43810,0,0,0,24036,0,0,21,37346,0,0,0,56458,0,0,0,5552,0,0,0,1085906,0,0,0,912,0,0,1,71728,0,0,1,55252,0,0 diff --git a/examples/datasets/sample4/sample4.yaml b/examples/datasets/sample4/sample4.yaml new file mode 100644 index 0000000..5f3df3f --- /dev/null +++ b/examples/datasets/sample4/sample4.yaml @@ -0,0 +1,22 @@ +pattern: + - '*.csv' + +delimiter: C + +columns: +- TYPE +- POSITION +- VARIANT + +annotation: +- type: plugin + plugin: HGVS_decoder + field: + - TYPE + - POSITION + - VARIANT +- type: internal + field: HGVS + fieldSource: + - 'HGVS Consequence' + - HGVSp diff --git a/examples/plugin_system/HGVS_decoder/HGVS_decoder.py b/examples/plugin_system/HGVS_decoder/HGVS_decoder.py new file mode 100644 index 0000000..b23c87a --- /dev/null +++ b/examples/plugin_system/HGVS_decoder/HGVS_decoder.py @@ -0,0 +1,137 @@ +from openvariant.plugins.context import Context +from openvariant.plugins.plugin import Plugin + +import re + +class HGVS_decoderContext(Context): + + def __init__(self, row: dict, field_name: str, file_path: str) -> None: + super().__init__(row, field_name, file_path) + + +amino_acids_map = { + "Ala": "Alanine", + "Arg": "Arginine", + "Asn": "Asparagine", + "Asp": "Aspartic Acid", + 
"Cys": "Cysteine", + "Gln": "Glutamine", + "Glu": "Glutamic Acid", + "Gly": "Glycine", + "His": "Histidine", + "Ile": "Isoleucine", + "Leu": "Leucine", + "Lys": "Lysine", + "Met": "Methionine", + "Phe": "Phenylalanine", + "Pro": "Proline", + "Ser": "Serine", + "Thr": "Threonine", + "Trp": "Tryptophan", + "Tyr": "Tyrosine", + "Val": "Valine", + "Ter": "Termination codon" +} + +variant_map = { + "delins": "deletion-insertion by ", + "del": "deletion", + "ins": "insertion of ", + "dup": "duplication", + "inv": "inversion", + "con": "conversion", + "ext": "extension of ", + "fs": "frameshift mutation of " +} + +position_regex = re.compile(r'(\(?\*?-?\??\_?\d+(?:\_?[+-]\d+\??)?\)?(_)?(?:\(?\*?-?\d+\_?(?:[+-]\d+)?\??\)?)?)') +protein_position_regex = re.compile(r'(?[ACTG]+|del|ins[ACTG]+|dup|inv|con|\[[0-9]+\]|delins[ACTG]+') +variant_rna_regex = re.compile(r'[agcu]+>[agcu]+|del|ins[agcu]+|dup|inv|con|\[[0-9]+\]|delins[agcu]+') + +amino_acids = r'(?:Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter)' +variant_protein_aa_regex = re.compile(rf'(? 
0: + variant = variant_map.get(matches_variant[0]) + matches_n = re.findall(nucleotides, matches[0]) + if len(matches_n) > 0: + variant += matches_n[0] + else: + variant = matches[0] + return variant + +def parse_hgvs_variant_protein(hgvs_str): + matches = re.findall(variant_protein_aa_regex, hgvs_str) + if len(matches) == 1: + variant = amino_acids_map.get(matches[0]) + else: + aa_1 = amino_acids_map.get(matches[0]) + aa_2 = amino_acids_map.get(matches[1]) + if aa_1 == aa_2: + variant = "Synonymous (silent) variant" + else: + variant = aa_1 + " mutated to " + aa_2 + matches = re.findall(variant_protein_mod_regex, hgvs_str) + if len(matches) > 0: + variant += " and " + matches_variant = re.findall(variant_type_regex, matches[0]) + variant += variant_map.get(matches_variant[0]) + matches_amino_acid = re.findall(amino_acids, matches[0]) + if len(matches_amino_acid) > 0: + variant += amino_acids_map.get(matches_amino_acid[0]) + return variant + +def parse_hgvs_unknow(hgvs_str): + return None + +def interpret_hgvs(hgvs_str): + prefix_map = { + "g.": ("gDNA", parse_hgvs_pos, parse_hgvs_variant), + "c.": ("cDNA", parse_hgvs_pos, parse_hgvs_variant), + "n.": ("ncDNA", parse_hgvs_pos, parse_hgvs_variant), + "m.": ("mtDNA", parse_hgvs_pos, parse_hgvs_variant), + "r.": ("RNA", parse_hgvs_pos, parse_hgvs_variant), + "p.": ("Protein", parse_hgvs_pos_protein, parse_hgvs_variant_protein), + } + + prefix = hgvs_str[:2] + + result = prefix_map.get(prefix, ("Unknown", parse_hgvs_unknow, parse_hgvs_unknow)) + seq = hgvs_str[2:] + + type_variant = result[0] + position = result[1](seq) + variant = result[2](seq) + + return type_variant, position, variant + + + +class HGVS_decoderPlugin(Plugin): + + def run(self, context: HGVS_decoderContext) -> dict: + + value = context.row["HGVS"] + type_variant, position, variant = interpret_hgvs(value) + + return type_variant, position, variant diff --git a/examples/plugin_system/HGVS_decoder/__init__.py 
b/examples/plugin_system/HGVS_decoder/__init__.py new file mode 100644 index 0000000..a9b2ec9 --- /dev/null +++ b/examples/plugin_system/HGVS_decoder/__init__.py @@ -0,0 +1,2 @@ +from .HGVS_decoder import HGVS_decoderPlugin +from .HGVS_decoder import HGVS_decoderContext diff --git a/examples/plugin_system/README.md b/examples/plugin_system/README.md index 0ea62c3..fdc90f3 100644 --- a/examples/plugin_system/README.md +++ b/examples/plugin_system/README.md @@ -1,3 +1,11 @@ # Plugin system examples -- [Plugin system](plugin_system.ipynb) - A simple example that two plugins. +### Plugins + +- [Add date](./add_date) - Plugin to add the current date. +- [Get length](./get_length) - Plugin to obtain the difference between two values. +- [HGVS decoder](./HGVS_decoder) - Plugin to decode the type, position and change of different variants. + +### Output example + +- [Plugin system](plugin_system.ipynb) - Unique and multiple fields plugins example. diff --git a/examples/plugin_system/plugin_system.ipynb b/examples/plugin_system/plugin_system.ipynb index 8469a7c..5a741d6 100644 --- a/examples/plugin_system/plugin_system.ipynb +++ b/examples/plugin_system/plugin_system.ipynb @@ -2,19 +2,40 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# Plugin system example" - ], "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "# Plugin system example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unique field plugin\n", + "\n", + "Includes the Add Date plugin and the Get Length plugin." 
+ ] }, { "cell_type": "code", "execution_count": 1, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -41,34 +62,74 @@ "source": [ "%%bash\n", "openvar cat ../datasets/sample3 --header" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multiple fields plugin\n", + "\n", + "Decoding HGVS across different variants" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TYPE\tPOSITION\tVARIANT\n", + "cDNA\t-33-42\tG>T\n", + "cDNA\t-33-42\tG>C\n", + "cDNA\t-33-42\tG>A\n", + "cDNA\t-33-39\tT>C\n", + "cDNA\t-33-37\tC>A\n", + "cDNA\t-33-36\tC>T\n", + "cDNA\t-33-34\tA>G\n", + "cDNA\t-33-33\tG>A\n", + "cDNA\t-33-30\tT>C\n", + "cDNA\t-33-28\tC>A\n", + "cDNA\t-33-27\tT>C\n", + "cDNA\t-33-25\tduplication\n", + "cDNA\t-33-24\tG>C\n", + "cDNA\t-33-24\tG>A\n", + "cDNA\t-33-23\tG>A\n", + "cDNA\t-33-21\tG>A\n", + "cDNA\t-33-20\tT>A\n", + "cDNA\t-33-19\tC>G\n", + "cDNA\t-33-19\tC>A\n" + ] } - } + ], + "source": [ + "%%bash\n", + "openvar cat ../datasets/sample4 --header" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.13.2" } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 4 +} diff --git a/openvariant/annotation/annotation.py b/openvariant/annotation/annotation.py index b35dd80..8865ecd 100644 --- 
a/openvariant/annotation/annotation.py +++ b/openvariant/annotation/annotation.py @@ -65,7 +65,7 @@ def _check_annotation_keys(annot: dict) -> None: raise ValueError(f"'{AnnotationKeys.TYPE.value}' value is wrong.") # Field key - if AnnotationKeys.FIELD.value not in annot or not isinstance(annot[AnnotationKeys.FIELD.value], str): + if AnnotationKeys.FIELD.value not in annot or (not isinstance(annot[AnnotationKeys.FIELD.value], list) and not isinstance(annot[AnnotationKeys.FIELD.value], str)): raise KeyError(f"'{AnnotationKeys.FIELD.value}' key not found or is not a str.") # Value key @@ -123,10 +123,13 @@ def _read_annotation_file(self) -> dict: logging.error(exc) stream.close() + def _clean_annotation_keys(self): + return [item for x in self.annotations.keys() for item in (list(x) if isinstance(x, tuple) else [x])] + def _check_columns(self) -> None: """Check if columns exists as annotation fields""" for col in self._columns: - if col not in self._annotations: + if col not in self._clean_annotation_keys(): raise KeyError(f"'{col}' column unable to find.") def __init__(self, annotation_path: str) -> None: @@ -164,15 +167,15 @@ def __init__(self, annotation_path: str) -> None: self._annotations: dict = {} for k in raw_annotation.get(AnnotationGeneralKeys.ANNOTATION.value, []): - class_name = k[AnnotationKeys.TYPE.value].upper() module_name = "openvariant.annotation.builder" ClassAnnotation = import_class_from_module(module_name, class_name) instance = ClassAnnotation() - - self._annotations[k[AnnotationKeys.FIELD.value]] = instance(k, self._path) - - self._columns = raw_annotation.get(AnnotationGeneralKeys.COLUMNS.value, list(self.annotations.keys())) + if isinstance(k[AnnotationKeys.FIELD.value], list): + self._annotations[tuple(k[AnnotationKeys.FIELD.value])] = instance(k, self._path) + else: + self._annotations[k[AnnotationKeys.FIELD.value]] = instance(k, self._path) + self._columns = raw_annotation.get(AnnotationGeneralKeys.COLUMNS.value, 
self._clean_annotation_keys()) self._check_columns() @property diff --git a/openvariant/variant/variant.py b/openvariant/variant/variant.py index ea790ed..f0163cb 100644 --- a/openvariant/variant/variant.py +++ b/openvariant/variant/variant.py @@ -122,6 +122,7 @@ def _extract_header(file_path: str, original_header: list, annotation: Annotatio instance = ClassAnnotation() header_schema.update({field: instance(ann, original_header, file_path, header_schema)}) + return header_schema, annotation.columns @@ -194,8 +195,9 @@ def __init__(self, path: str, annotation: Annotation, skip_files: bool = False) csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) self._path: str = path self._annotation: Annotation = annotation - self._header: List[str] = list(annotation.annotations.keys()) if len(annotation.columns) == 0 \ - else annotation.columns + + + self._header: List[str] = annotation.columns self.skip_files = skip_files def _unify(self, base_path: str, annotation: Annotation, group_by: str = None, display_header: bool = True) \ @@ -216,7 +218,6 @@ def _parser(self, file_path: str, annotation: Annotation, group_by: str, display try: self.mm, self.file = _open_file(file_path, "rb") - for lnum, line in _base_parser(self.mm, file_path, annotation.delimiter, self.skip_files): try: if header is None: @@ -253,9 +254,15 @@ def _parser(self, file_path: str, annotation: Annotation, group_by: str, display for head, mapping in mapping_values.items(): _, builder_mapping, func = mapping line_dict[head] = _parse_mapping_field(builder_mapping, line_dict, func) + for head, plug in plugin_values.items(): _, ctxt_plugin, func_plugin = plug - line_dict[head] = _parse_plugin_field(line_dict, head, file_path, ctxt_plugin, func_plugin) + value_plugin = _parse_plugin_field(line_dict, head, file_path, ctxt_plugin, func_plugin) + if isinstance(head, tuple): + for idx, x in enumerate(head): + line_dict[x] = value_plugin[idx] + else: + line_dict[head] = value_plugin for k in annotation.columns: 
row[k] = line_dict[k].format(**line_dict)