diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e72e206..317c6394 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,8 @@ ## MAJOR CHANGES +* Removed `methods/cytovi` from the benchmark. The implementation is preserved in the `add-cytovi-implementation` branch to be revisited in the near future (PR #124). + * Updated file schema (PR #18): * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. * Removed donor_id obs from unintegrated censored. diff --git a/src/methods/cytovi/config.vsh.yaml b/src/methods/cytovi/config.vsh.yaml deleted file mode 100644 index 4eb7ea66..00000000 --- a/src/methods/cytovi/config.vsh.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_method.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: cytovi -# A relatively short label, used when rendering visualisations (required) -label: CytoVI -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "A deep generative model for correcting batch effects" -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - CytoVI is a deep generative model that utilizes antibody-based single-cell profiles to - learn a biologically meaningful latent representation of each cell. - It is part of the scvi-tools framework and is built upon the variational autoencoder (VAE) architecture. - - In this implementation, we first transform the data using minmax scaler in cytovi.scale function. - We also fit a separate sklearn minmax scaler on data from batch 1. - We then train CytoVI model on the scaled data and generate the corrected marker - expression values. - We then inverse transforms the corrected data using the scaler fitted on batch 1 data - to obtain the final corrected values. - These values are saved in the anndata object. -references: - doi: - - 10.1101/2025.09.07.674699 -links: - # URL to the documentation for this method (required). - documentation: https://docs.scvi-tools.org/en/latest/user_guide/models/cytovi.html - # URL to the code repository for this method (required). - repository: https://github.com/YosefLab/cytovi-reference-implementation - -# Component-specific parameters (optional) -arguments: - - name: --n_hidden - type: integer - default: 128 - description: Number of hidden units. - - name: --n_layers - type: integer - default: 1 - description: Number of layers. - - name: --max_epochs - type: integer - default: 1000 - description: Number of epochs to train the model. - - name: --train_size - type: double - default: 0.9 - description: Fraction of cells to subsample from each cluster for training. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - - type: docker - image: nvcr.io/nvidia/pytorch:25.08-py3 - setup: - - type: apt - packages: - - procps - - git - - type: python - packages: - - anndata>=0.11.0 - - scanpy[skmisc]>=1.10 - - scvi-tools==1.4.0.post1 - - pyyaml - - requests - - jsonschema - github: - - openproblems-bio/core#subdirectory=packages/python/openproblems - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [veryhightime, midmem, midcpu, gpu] diff --git a/src/methods/cytovi/script.py b/src/methods/cytovi/script.py deleted file mode 100644 index 7c8d0a52..00000000 --- a/src/methods/cytovi/script.py +++ /dev/null @@ -1,145 +0,0 @@ -import time - -import anndata as ad -import numpy as np -import scvi -import torch -from scvi.external import cytovi -from sklearn.preprocessing import MinMaxScaler - -## VIASH START -par = { - "input": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/censored_split2.h5ad", - "output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/output_cytovi_split2.h5ad", - "n_hidden": 128, - "n_layers": 1, - "max_epochs": 1000, - "train_size": 0.9, -} -meta = {"name": "cytovi"} -## VIASH END - -# setting calculation to TF32 to speed up training -torch.backends.cuda.matmul.allow_tf32 = True - -# increase num workers for data loading -scvi.settings.num_workers = 95 -scvi.settings.seed = 0 - -print("Reading and preparing input files", flush=True) -adata = ad.read_h5ad(par["input"]) - -adata.obs["batch_str"] = adata.obs["batch"].astype(str) -adata.obs["sample_key_str"] = adata.obs["sample"].astype(str) - -markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy() -markers_not_correct = adata.var[~adata.var["to_correct"]].index.to_numpy() - -adata_to_correct = adata[:, markers_to_correct].copy() - -print("Scaling data", flush=True) - -# scale data. this will add a layer "scaled" to the anndata -cytovi.scale( - adata=adata_to_correct, - transformed_layer_key="preprocessed", - batch_key="batch_str", - scaled_layer_key="scaled", - inplace=True, - method="minmax", -) - -# create minmax scaler object for one batch to use later for untransforming batch corrected data -print("Creating minmax scaler for untransforming data", flush=True) - -# the following are taken from cytovi source code -feature_range = (0.0, 1.0) -feat_eps = 1e-6 -feature_range = (feature_range[0] + feat_eps, feature_range[1] - feat_eps) - -# get data from batch one -batch_one = ( - adata_to_correct[adata_to_correct.obs["batch_str"] == "1"] - .layers["preprocessed"] - .copy() -) -batch_one_scaler = MinMaxScaler(feature_range=feature_range) - -print( - "Fitting minmax scaler on batch one using feature range", feature_range, flush=True -) -batch_one_scaler.fit(batch_one) - -print("Memory cleanup before training", flush=True) -del batch_one - -print( - f"Train CytoVI on {adata_to_correct.shape[0]} cells", - flush=True, -) - -cytovi.CYTOVI.setup_anndata( - adata_to_correct, - layer="scaled", - batch_key="batch_str", - sample_key="sample_key_str", -) - -model = cytovi.CYTOVI( - adata_to_correct, n_hidden=par["n_hidden"], n_layers=par["n_layers"] -) - -print("Start training CytoVI model", flush=True) - -start = time.time() -model.train( - batch_size=8192, - max_epochs=par["max_epochs"], - train_size=par["train_size"], -) -end = time.time() -print(f"Training took {end - start:.2f} seconds", flush=True) - -# get batch corrected data -print("Calculating batch corrected data", flush=True) -corrected_data = model.get_normalized_expression(adata=adata_to_correct) - -# have to save the columns to be able to reconstruct later -corrected_data_markers = corrected_data.columns.to_numpy() - -print("Untransforming batch corrected data", flush=True) -# untransform data using batch one scaler -corrected_data = batch_one_scaler.inverse_transform(corrected_data.to_numpy()) - -# have to add in the uncorrected markers as well -uncorrected_data = adata[:, markers_not_correct].layers["preprocessed"].copy() - -out_matrix = np.concatenate([corrected_data, uncorrected_data], axis=1) -out_var_idx = np.concatenate([corrected_data_markers, markers_not_correct]) - -# create new anndata -out_adata = ad.AnnData( - obs=adata.obs[[]], - var=adata.var.loc[out_var_idx][[]], - layers={"integrated": out_matrix}, - uns={ - "dataset_id": adata.uns["dataset_id"], - "method_id": meta["name"], - "parameters": par, - }, -) - -# reorder var to match input -out_adata = out_adata[:, adata.var_names] - -print("Write output AnnData to file", flush=True) - -out_adata.write_h5ad(par["output"], compression="gzip") - -print( - "Written anndata of shape ", - out_adata.shape, - " to file: ", - par["output"], - flush=True, -) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index cb9814ae..c74c192b 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -107,7 +107,6 @@ dependencies: - name: methods/batchadjust_all_controls - name: methods/rpca_to_goal - name: methods/rpca_to_mid - # - name: methods/cytovi - name: metrics/emd - name: metrics/ratio_consistent_peaks - name: metrics/average_batch_r2 diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index f4514946..0d9b1652 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -34,7 +34,6 @@ methods = [ limma_remove_batch_effect, rpca_to_goal, rpca_to_mid, - // cytovi ] // construct list of metrics