From c6514f15a8b162855aa77907b6a1135e4dd843fb Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Mon, 18 Jul 2022 19:27:55 +0200 Subject: [PATCH 1/9] added scvi-scarches-xgb to label projection --- .../label_projection/methods/scvi_tools.py | 88 ++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 05cd51825d..5191e89180 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -86,7 +86,7 @@ def _scanvi(adata, test=False, n_hidden=None, n_latent=None, n_layers=None): return preds -def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=None): +def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=None, prediction_method='scanvi'): import scvi if test: @@ -138,6 +138,15 @@ def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=N train_kwargs["limit_val_batches"] = 10 query_model.train(plan_kwargs=dict(weight_decay=0.0), **train_kwargs) + if prediction_method == 'scanvi': + preds = _pred_scanvi(adata, query_model) + elif prediction_method == 'xgboost': + preds = _pred_xgb(adata, adata_train, adata_test, query_model, test=test) + + return preds + + +def _pred_scanvi(adata, query_model): # this is temporary and won't be used adata.obs["scanvi_labels"] = "Unknown" preds = query_model.predict(adata) @@ -145,6 +154,58 @@ def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=N # predictions for train and test return preds +# note: add test option here +def _pred_xgb( + adata, + adata_train, + adata_test, + query_model, + label_col='labels', + test=False, + num_round: Optional[int] = None +): + import xgboost as xgb + + df = _classif_df(adata_train, query_model, label_col) + + X_train = df.drop(columns='labels') + y_train = df['labels'] + + X_test = query_model.get_latent_representation(adata_test) + + if test: + num_round = num_round or 2 + else: + num_round = num_round or 5 + + xgbc = xgb.XGBClassifier( + tree_method = 'hist', + objective = 'multi:softprob' + ) + + + xgbc.fit(X_train, y_train) + + adata_test.obs['preds_test'] = xgbc.predict(X_test) + + preds = [adata_test.obs['preds_test'][idx] if idx in adata_test.obs_names else np.nan + for idx in adata.obs_names] + + return preds + + +def _classif_df(adata, trained_model, label_col): + emb_data = trained_model.get_latent_representation(adata) + + df = pd.DataFrame( + data = emb_data, + index = adata.obs_names + ) + + df['labels'] = adata.obs[label_col] + + return df + @_scanvi_method(method_name="scANVI (All genes)") def scanvi_all_genes(adata, test=False): @@ -176,3 +237,28 @@ def scarches_scanvi_hvg(adata, test=False): adata.obs["labels_pred"] = _scanvi_scarches(bdata, test=test) adata.uns["method_code_version"] = check_version("scvi-tools") return adata + +@_scanvi_scarches_method(method_name="scArches+scANVI+xgboost (All genes)") +def scarches_scanvi_all_genes(adata, test=False): + adata.obs["labels_pred"] = _scanvi_scarches( + adata, + test=test, + prediction_method='xgboost' + ) + + adata.uns["method_code_version"] = check_version("scvi-tools") + return adata + + +@_scanvi_scarches_method(method_name="scArches+scANVI+xgboost (Seurat v3 2000 HVG)") +def scarches_scanvi_hvg(adata, test=False): + hvg_df = _hvg(adata, test) + bdata = adata[:, hvg_df.highly_variable].copy() + adata.obs["labels_pred"] = _scanvi_scarches( + bdata, + test=test, + prediction_method='xgboost' + ) + + adata.uns["method_code_version"] = check_version("scvi-tools") + return adata From 999d0a633e2d422a5d839916c1186f2db223307b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 17:31:48 +0000 Subject: [PATCH 2/9] pre-commit --- .../label_projection/methods/scvi_tools.py | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 5191e89180..437c588019 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -86,7 +86,14 @@ def _scanvi(adata, test=False, n_hidden=None, n_latent=None, n_layers=None): return preds -def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=None, prediction_method='scanvi'): +def _scanvi_scarches( + adata, + test=False, + n_hidden=None, + n_latent=None, + n_layers=None, + prediction_method="scanvi", +): import scvi if test: @@ -138,11 +145,11 @@ def _scanvi_scarches(adata, test=False, n_hidden=None, n_latent=None, n_layers=N train_kwargs["limit_val_batches"] = 10 query_model.train(plan_kwargs=dict(weight_decay=0.0), **train_kwargs) - if prediction_method == 'scanvi': + if prediction_method == "scanvi": preds = _pred_scanvi(adata, query_model) - elif prediction_method == 'xgboost': + elif prediction_method == "xgboost": preds = _pred_xgb(adata, adata_train, adata_test, query_model, test=test) - + return preds @@ -154,22 +161,23 @@ def _pred_scanvi(adata, query_model): # predictions for train and test return preds + # note: add test option here def _pred_xgb( - adata, - adata_train, - adata_test, - query_model, - label_col='labels', - test=False, - num_round: Optional[int] = None + adata, + adata_train, + adata_test, + query_model, + label_col="labels", + test=False, + num_round: Optional[int] = None, ): import xgboost as xgb - + df = _classif_df(adata_train, query_model, label_col) - X_train = df.drop(columns='labels') - y_train = df['labels'] + X_train = df.drop(columns="labels") + y_train = df["labels"] X_test = query_model.get_latent_representation(adata_test) @@ -178,32 +186,27 @@ def _pred_xgb( else: num_round = num_round or 5 - xgbc = xgb.XGBClassifier( - tree_method = 'hist', - objective = 'multi:softprob' - ) + xgbc = xgb.XGBClassifier(tree_method="hist", objective="multi:softprob") - xgbc.fit(X_train, y_train) - adata_test.obs['preds_test'] = xgbc.predict(X_test) + adata_test.obs["preds_test"] = xgbc.predict(X_test) + + preds = [ + adata_test.obs["preds_test"][idx] if idx in adata_test.obs_names else np.nan + for idx in adata.obs_names + ] - preds = [adata_test.obs['preds_test'][idx] if idx in adata_test.obs_names else np.nan - for idx in adata.obs_names] - return preds def _classif_df(adata, trained_model, label_col): emb_data = trained_model.get_latent_representation(adata) - - df = pd.DataFrame( - data = emb_data, - index = adata.obs_names - ) - - df['labels'] = adata.obs[label_col] - + + df = pd.DataFrame(data=emb_data, index=adata.obs_names) + + df["labels"] = adata.obs[label_col] + return df @@ -238,12 +241,11 @@ def scarches_scanvi_hvg(adata, test=False): adata.uns["method_code_version"] = check_version("scvi-tools") return adata + @_scanvi_scarches_method(method_name="scArches+scANVI+xgboost (All genes)") def scarches_scanvi_all_genes(adata, test=False): adata.obs["labels_pred"] = _scanvi_scarches( - adata, - test=test, - prediction_method='xgboost' + adata, test=test, prediction_method="xgboost" ) adata.uns["method_code_version"] = check_version("scvi-tools") @@ -255,10 +257,8 @@ def scarches_scanvi_hvg(adata, test=False): hvg_df = _hvg(adata, test) bdata = adata[:, hvg_df.highly_variable].copy() adata.obs["labels_pred"] = _scanvi_scarches( - bdata, - test=test, - prediction_method='xgboost' + bdata, test=test, prediction_method="xgboost" ) - + adata.uns["method_code_version"] = check_version("scvi-tools") return adata From 7b60383777ad19ae3d7fdc3817f91c553de3757c Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Mon, 18 Jul 2022 19:32:10 +0200 Subject: [PATCH 3/9] added methods to init and renamed --- openproblems/tasks/label_projection/methods/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openproblems/tasks/label_projection/methods/__init__.py b/openproblems/tasks/label_projection/methods/__init__.py index a5b099a5ae..1330b09e12 100644 --- a/openproblems/tasks/label_projection/methods/__init__.py +++ b/openproblems/tasks/label_projection/methods/__init__.py @@ -10,5 +10,7 @@ from .scvi_tools import scanvi_hvg from .scvi_tools import scarches_scanvi_all_genes from .scvi_tools import scarches_scanvi_hvg +from .scvi_tools import scarches_scanvi_xgb_all_genes +from .scvi_tools import scarches_scanvi_xgb_hvg from .xgboost import xgboost_log_cpm from .xgboost import xgboost_scran From 85ddc04ec09157e458bae16c3dc10fee49a6a80a Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Mon, 18 Jul 2022 19:59:07 +0200 Subject: [PATCH 4/9] updated method naming and added imports --- openproblems/tasks/label_projection/methods/scvi_tools.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 437c588019..2f97013456 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from typing import Optional import functools @@ -173,6 +174,7 @@ def _pred_xgb( num_round: Optional[int] = None, ): import xgboost as xgb + import numpy as np df = _classif_df(adata_train, query_model, label_col) @@ -201,6 +203,8 @@ def _pred_xgb( def _classif_df(adata, trained_model, label_col): + import pandas as pd + emb_data = trained_model.get_latent_representation(adata) df = pd.DataFrame(data=emb_data, index=adata.obs_names) @@ -243,7 +247,7 @@ def scarches_scanvi_hvg(adata, test=False): @_scanvi_scarches_method(method_name="scArches+scANVI+xgboost (All genes)") -def scarches_scanvi_all_genes(adata, test=False): +def scarches_scanvi_xgb_all_genes(adata, test=False): adata.obs["labels_pred"] = _scanvi_scarches( adata, test=test, prediction_method="xgboost" ) @@ -253,7 +257,7 @@ def scarches_scanvi_all_genes(adata, test=False): @_scanvi_scarches_method(method_name="scArches+scANVI+xgboost (Seurat v3 2000 HVG)") -def scarches_scanvi_hvg(adata, test=False): +def scarches_scanvi_xgb_hvg(adata, test=False): hvg_df = _hvg(adata, test) bdata = adata[:, hvg_df.highly_variable].copy() adata.obs["labels_pred"] = _scanvi_scarches( From 4e1ee2a81617bef69c721529cab607253e297eab Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Jul 2022 18:02:04 +0000 Subject: [PATCH 5/9] pre-commit --- openproblems/tasks/label_projection/methods/scvi_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 2f97013456..10052bd2db 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -173,8 +173,8 @@ def _pred_xgb( test=False, num_round: Optional[int] = None, ): - import xgboost as xgb import numpy as np + import xgboost as xgb df = _classif_df(adata_train, query_model, label_col) From 76936d0df454592b83fdd5074825df30dc9c5c93 Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Tue, 19 Jul 2022 12:04:59 +0200 Subject: [PATCH 6/9] made labels categorical --- openproblems/tasks/label_projection/methods/scvi_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 10052bd2db..6f16afc9bc 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -179,7 +179,7 @@ def _pred_xgb( df = _classif_df(adata_train, query_model, label_col) X_train = df.drop(columns="labels") - y_train = df["labels"] + y_train = df["labels"].astype("category") X_test = query_model.get_latent_representation(adata_test) From bc62d91dbf7f61c08ae6e1a889745872cc999f0b Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Tue, 19 Jul 2022 13:02:37 +0200 Subject: [PATCH 7/9] test integer label encoding --- .../tasks/label_projection/methods/scvi_tools.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 6f16afc9bc..8636bec8bc 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -163,7 +163,7 @@ def _pred_scanvi(adata, query_model): return preds -# note: add test option here +# note: could extend test option def _pred_xgb( adata, adata_train, @@ -178,9 +178,14 @@ def _pred_xgb( df = _classif_df(adata_train, query_model, label_col) - X_train = df.drop(columns="labels") - y_train = df["labels"].astype("category") + df['labels_int'] = df['labels'].cat.codes + categories = df['labels'].cat.categories + #X_train = df.drop(columns="labels") + X_train = df.drop(columns=["labels", "labels_int"]) + #y_train = df["labels"].astype("category") + y_train = df["labels_int"].astype(int) + X_test = query_model.get_latent_representation(adata_test) if test: @@ -192,7 +197,8 @@ def _pred_xgb( xgbc.fit(X_train, y_train) - adata_test.obs["preds_test"] = xgbc.predict(X_test) + #adata_test.obs["preds_test"] = xgbc.predict(X_test) + adata_test.obs["preds_test"] = categories[xgbc.predict(X_test)] preds = [ adata_test.obs["preds_test"][idx] if idx in adata_test.obs_names else np.nan From 655a846b4fe0c03ff0779e0f9763a2f8b6e286f8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:04:50 +0000 Subject: [PATCH 8/9] pre-commit --- .../tasks/label_projection/methods/scvi_tools.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 8636bec8bc..1fce0cbb88 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -178,14 +178,14 @@ def _pred_xgb( df = _classif_df(adata_train, query_model, label_col) - df['labels_int'] = df['labels'].cat.codes - categories = df['labels'].cat.categories + df["labels_int"] = df["labels"].cat.codes + categories = df["labels"].cat.categories - #X_train = df.drop(columns="labels") + # X_train = df.drop(columns="labels") X_train = df.drop(columns=["labels", "labels_int"]) - #y_train = df["labels"].astype("category") + # y_train = df["labels"].astype("category") y_train = df["labels_int"].astype(int) - + X_test = query_model.get_latent_representation(adata_test) if test: @@ -197,7 +197,7 @@ def _pred_xgb( xgbc.fit(X_train, y_train) - #adata_test.obs["preds_test"] = xgbc.predict(X_test) + # adata_test.obs["preds_test"] = xgbc.predict(X_test) adata_test.obs["preds_test"] = categories[xgbc.predict(X_test)] preds = [ From 6a06835ba86133331ed57bf1880921b87a5eb083 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Thu, 13 Oct 2022 13:20:34 -0400 Subject: [PATCH 9/9] share xgboost code with scarches_xgb --- .../label_projection/methods/scvi_tools.py | 68 +++++-------------- .../tasks/label_projection/methods/xgboost.py | 19 +++++- 2 files changed, 32 insertions(+), 55 deletions(-) diff --git a/openproblems/tasks/label_projection/methods/scvi_tools.py b/openproblems/tasks/label_projection/methods/scvi_tools.py index 1fce0cbb88..1a1ea42223 100644 --- a/openproblems/tasks/label_projection/methods/scvi_tools.py +++ b/openproblems/tasks/label_projection/methods/scvi_tools.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .xgboost import _xgboost from typing import Optional import functools @@ -95,6 +96,7 @@ def _scanvi_scarches( n_layers=None, prediction_method="scanvi", ): + import numpy as np import scvi if test: @@ -106,11 +108,14 @@ def _scanvi_scarches( n_layers = n_layers or 2 n_hidden = n_hidden or 128 + unlabeled_category = "Unknown" + # new obs labels to mask test set + adata.obs["scanvi_labels"] = np.where( + adata.obs["is_train"], adata.obs["labels"], unlabeled_category + ) adata_train = adata[adata.obs["is_train"]].copy() - adata_train.obs["scanvi_labels"] = adata_train.obs["labels"].copy() adata_test = adata[~adata.obs["is_train"]].copy() - adata_test.obs["scanvi_labels"] = "Unknown" scvi.model.SCVI.setup_anndata( adata_train, batch_key="batch", labels_key="scanvi_labels" ) @@ -135,7 +140,9 @@ def _scanvi_scarches( train_kwargs["limit_train_batches"] = 10 train_kwargs["limit_val_batches"] = 10 scvi_model.train(**train_kwargs) - model = scvi.model.SCANVI.from_scvi_model(scvi_model, unlabeled_category="Unknown") + model = scvi.model.SCANVI.from_scvi_model( + scvi_model, unlabeled_category=unlabeled_category + ) model.train(**train_kwargs) query_model = scvi.model.SCANVI.load_query_data(adata_test, model) @@ -149,7 +156,7 @@ def _scanvi_scarches( if prediction_method == "scanvi": preds = _pred_scanvi(adata, query_model) elif prediction_method == "xgboost": - preds = _pred_xgb(adata, adata_train, adata_test, query_model, test=test) + preds = _pred_xgb(adata, query_model, test=test) return preds @@ -166,58 +173,15 @@ def _pred_scanvi(adata, query_model): # note: could extend test option def _pred_xgb( adata, - adata_train, - adata_test, query_model, - label_col="labels", test=False, num_round: Optional[int] = None, ): - import numpy as np - import xgboost as xgb - - df = _classif_df(adata_train, query_model, label_col) - - df["labels_int"] = df["labels"].cat.codes - categories = df["labels"].cat.categories - - # X_train = df.drop(columns="labels") - X_train = df.drop(columns=["labels", "labels_int"]) - # y_train = df["labels"].astype("category") - y_train = df["labels_int"].astype(int) - - X_test = query_model.get_latent_representation(adata_test) - - if test: - num_round = num_round or 2 - else: - num_round = num_round or 5 - - xgbc = xgb.XGBClassifier(tree_method="hist", objective="multi:softprob") - - xgbc.fit(X_train, y_train) - - # adata_test.obs["preds_test"] = xgbc.predict(X_test) - adata_test.obs["preds_test"] = categories[xgbc.predict(X_test)] - - preds = [ - adata_test.obs["preds_test"][idx] if idx in adata_test.obs_names else np.nan - for idx in adata.obs_names - ] - - return preds - - -def _classif_df(adata, trained_model, label_col): - import pandas as pd - - emb_data = trained_model.get_latent_representation(adata) - - df = pd.DataFrame(data=emb_data, index=adata.obs_names) - - df["labels"] = adata.obs[label_col] - - return df + adata.obsm["X_emb"] = query_model.get_latent_representation(adata) + adata = _xgboost( + adata, test=test, obsm="X_emb", num_round=num_round, tree_method="hist" + ) + return adata.obs["labels_pred"] @_scanvi_method(method_name="scANVI (All genes)") diff --git a/openproblems/tasks/label_projection/methods/xgboost.py b/openproblems/tasks/label_projection/methods/xgboost.py index b489f98bce..03d59a5136 100644 --- a/openproblems/tasks/label_projection/methods/xgboost.py +++ b/openproblems/tasks/label_projection/methods/xgboost.py @@ -16,7 +16,13 @@ ) -def _xgboost(adata, test: bool = False, num_round: Optional[int] = None): +def _xgboost( + adata, + test: bool = False, + obsm: Optional[str] = None, + num_round: Optional[int] = None, + **kwargs, +): import xgboost as xgb if test: @@ -30,12 +36,19 @@ def _xgboost(adata, test: bool = False, num_round: Optional[int] = None): adata_train = adata[adata.obs["is_train"]] adata_test = adata[~adata.obs["is_train"]].copy() - xg_train = xgb.DMatrix(adata_train.X, label=adata_train.obs["labels_int"]) - xg_test = xgb.DMatrix(adata_test.X, label=adata_test.obs["labels_int"]) + xg_train = xgb.DMatrix( + adata_train.obsm[obsm] if obsm else adata_train.X, + label=adata_train.obs["labels_int"], + ) + xg_test = xgb.DMatrix( + adata_test.obsm[obsm] if obsm else adata_test.X, + label=adata_test.obs["labels_int"], + ) param = dict( objective="multi:softmax", num_class=len(categories), + **kwargs, ) watchlist = [(xg_train, "train")]