Use fast_bioservices instead of multi_bioservices (#181)
* Fix conflicting package requirements

* Initial `ruff` formatting

* Added command line arguments to expose additional options

* Fixed imports, `ruff` formatting and import sorting

* Ignore Rout files

* Migrate from `multi_bioservices` to `fast_bioservices`

This also formats files using `ruff`, which makes it appear as if there are many more changes than actually occurred (a before/after sketch of the migration follows the file summary below)

* Use fast_bioservices instead of multi_bioservices

* Format with ruff, use fast_bioservices instead of multi_bioservices. This file may be deleted/reorganized because it is related to microarray, which we are removing from COMO

* Fix arguments, add biodbnet progress

* Fix `Input` usage

* Fix argument formatting and usage

* Fix argument usage

* Ignore microarray.db

* Fix `multi_bioservices` import, skip microarray tests
Josh Loecker authored Apr 26, 2024
1 parent 7ad61b5 commit ce2c833
Showing 15 changed files with 387 additions and 224 deletions.
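The heart of the migration, visible in the main/src/GSEpipelineFast.py diff below: the module-level `db2db` function and `InputDatabase`/`OutputDatabase` enums from `multi_bioservices` are replaced by a `BioDBNet` client object and `Input`/`Output` enums from `fast_bioservices`, with progress reporting and caching set at construction. A minimal before/after sketch (the probe ID is a hypothetical placeholder):

```python
from fast_bioservices import BioDBNet, Input, Output

# Before (multi_bioservices): a module-level function with *Database enums
# from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, db2db
# df = db2db(input_values=probe_ids, input_db=InputDatabase.AGILENT_ID,
#            output_db=[OutputDatabase.GENE_ID])

# After (fast_bioservices): a client object that carries its own options
biodbnet = BioDBNet(show_progress=True, cache=True)
df = biodbnet.db2db(
    input_values=["A_23_P100001"],  # hypothetical Agilent probe ID
    input_db=Input.AGILENT_ID,
    output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
    taxon=9606,  # human, matching the notebook's taxon_id
)
```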
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ main/microarray.db
main/data/config_sheets/*
main/data/GSE*_RAW
main/data/gpl*entrez.csv
main/src/microarray.db
2 changes: 1 addition & 1 deletion environment.yaml
@@ -60,10 +60,10 @@ dependencies:
# - conda-forge::xlrd~=2.0.1
- gurobi::gurobi
- pip:
- multi_bioservices
# - escher==1.7.3
- git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py
- framed==0.5.*
- memote<=1.0
- git+https://github.com/JoshLoecker/cobamp.git
- git+https://github.com/JoshLoecker/troppo.git
- git+https://github.com/JoshLoecker/fast_bioservices.git
12 changes: 12 additions & 0 deletions main/COMO.ipynb
@@ -799,11 +799,17 @@
"mrna_weight = 6\n",
"single_cell_weight = 6\n",
"proteomics_weight = 10\n",
"taxon_id = 9606 # Human\n",
"show_biodbnet_progress = True\n",
"use_biodbnet_cache = True\n",
"\n",
"cmd = \" \".join(\n",
" [\n",
" \"python3\", \"src/merge_xomics.py\",\n",
" \"--merge-distribution\",\n",
" \"--taxon-id\", f\"{taxon_id}\",\n",
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
" #\"--microarray-config-file\", f\"{microarray_config_file}\", # If using micro-array, uncomment the start of this line\n",
" \"--total-rnaseq-config-file\", f\"{trnaseq_config_file}\",\n",
" # \"--mrnaseq-config-file\", f\"{mrnaseq_config_file}\",\n",
@@ -1229,6 +1235,9 @@
"import json\n",
"from src.utilities import stringlist_to_list\n",
"\n",
"show_biodbnet_progress = True\n",
"use_biodbnet_cache = True\n",
"\n",
"drug_raw_file = \"Repurposing_Hub_export.txt\"\n",
"for context in stringlist_to_list(context_names):\n",
" for recon_algorithm in recon_algorithms:\n",
Expand Down Expand Up @@ -1275,6 +1284,9 @@
" \"--disease-up\", f\"{up_regulated_disease_genes}\",\n",
" \"--disease-down\", f\"{down_regulated_disease_genes}\",\n",
" \"--raw-drug-file\", f\"{drug_raw_file}\",\n",
" \"--taxon-id\", f\"{taxon_id}\",\n",
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
" \"--solver\", f\"{sovler}\",\n",
" #\"--test-all\"\n",
" ]\n",
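For orientation, the first notebook cell above joins its tokens into a plain shell command along these lines (the config filename is illustrative; note that the `True`/`False` values are interpolated as literal strings):

```python
# Roughly what the " ".join(...) in the cell above produces:
cmd = (
    "python3 src/merge_xomics.py --merge-distribution "
    "--taxon-id 9606 "
    "--show-biodbnet-progress True "
    "--use-biodbnet-cache True "
    "--total-rnaseq-config-file trnaseq_config.xlsx"  # illustrative filename
)
```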
75 changes: 46 additions & 29 deletions main/src/GSEpipelineFast.py
@@ -7,11 +7,9 @@
import numpy as np
import pandas as pd
import rpy2.robjects as ro
from fast_bioservices import BioDBNet, Input, Output
from GSEpipeline import load_gse_soft
from instruments import AffyIO

# from fast_bioservices import BioDBNet, Input, Output
from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, TaxonID, db2db
from rpy2.robjects import pandas2ri

pandas2ri.activate()
@@ -21,8 +19,17 @@

# gse = load_gse_soft(gsename)

from fast_bioservices import BioDBNet, Input, Output


def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor="affy"):
def download_gsm_id_maps(
datadir,
gse,
biodbnet: BioDBNet,
taxon_id: int,
gpls: Optional[list[str]] = None,
vendor="affy",
):
"""
download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
:param gpls:
@@ -46,18 +53,19 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=
table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
].tolist()

temp = db2db(
temp = biodbnet.db2db(
input_values=input_values,
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID, OutputDatabase.ENSEMBL_GENE_ID],
input_db=Input.AGILENT_ID,
output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
taxon=taxon_id,
)

temp.drop(columns=["Ensembl Gene ID"], inplace=True)
temp.reset_index(inplace=True)
temp.rename(
columns={
InputDatabase.AGILENT_ID.value: "ID",
OutputDatabase.GENE_ID.value: "ENTREZ_GENE_ID",
Input.AGILENT_ID.value: "ID",
Output.GENE_ID.value: "ENTREZ_GENE_ID",
},
inplace=True,
)
@@ -74,14 +82,27 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=


class GSEproject:
def __init__(self, gsename, querytable, rootdir="../"):
def __init__(
self,
gsename,
querytable,
show_biodbnet_progress: bool = False,
use_biodbnet_cache: bool = True,
rootdir="../",
):
self.gsename = gsename
# Setup paths
self.querytable = querytable
self.rootdir = rootdir
self.datadir = os.path.join(self.rootdir, "data")
self.outputdir = os.path.join(self.rootdir, "output")
self.gene_dir = os.path.join(self.datadir, self.gsename + "_RAW")

self.biodbnet = BioDBNet(
show_progress=show_biodbnet_progress,
cache=use_biodbnet_cache,
)

print(
"Initialize project ({}):\nRoot: {}\nRaw data: {}".format(
self.gsename, self.rootdir, self.gene_dir
@@ -137,7 +158,13 @@ def get_gsm_tables(self):
if not os.path.isfile(filepath):
# Could improve to automatic download new tables based on platform
gse = load_gse_soft(self.gsename)
download_gsm_id_maps(self.datadir, gse, gpls=[gpl], vendor=vendor)
download_gsm_id_maps(
self.datadir,
gse,
gpls=[gpl],
vendor=vendor,
biodbnet=self.biodbnet,
)
print("Skip Unsupported Platform: {}, {}".format(gpl, vendor))
# continue
temp = pd.read_csv(filepath)
@@ -225,16 +252,6 @@ def get_entrez_table_pipeline(self, fromcsv=True):
output_db=[OutputDatabase.GENE_ID],
)

outputdf = instruments.readagilent(
platformdir, list(self.gsm_platform.keys())
)

gsm_maps[key] = db2db(
input_values=list(map(str, list(outputdf["ProbeName"]))),
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID],
)

gsm_maps[key].rename(
columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True
)
@@ -271,23 +288,23 @@ def get_entrez_table_pipeline(self, fromcsv=True):
how="outer",
)

df_outer_sc500.dropna(how="all", inplace=True) # type: ignore
print("Full: {}".format(df_outer_sc500.shape)) # type: ignore
df_outer_sc500.rename(str.lower, axis="columns", inplace=True) # type: ignore
df_outer_sc500.dropna(how="all", inplace=True)
print("Full: {}".format(df_outer_sc500.shape))
df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
keys = []
vals = []
gsms_loaded = []

for col in list(df_outer_sc500): # type: ignore
if ".cel.gz" in col: # type: ignore
strs = col.split(".cel.gz") # type: ignore
for col in list(df_outer_sc500):
if ".cel.gz" in col:
strs = col.split(".cel.gz")
gsm = strs[0].split("_")[0]
newcol = "{}.cel.gz{}".format(gsm, strs[-1])
vals.append(newcol)
keys.append(col)
gsms_loaded.append(gsm)

df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True) # type: ignore
df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))

# Remove duplicated items, keep largest VALUE for each GSM
@@ -329,7 +346,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
)

try:
temp = df_outer_sc500.loc[:, [col1, col2, col3]] # type: ignore
temp = df_outer_sc500.loc[:, [col1, col2, col3]]

except:
if key in list(self.gsm_platform.keys()):
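A hypothetical construction of the updated class, showing how the two new options flow through to the `BioDBNet` client (the GSE accession and query table are placeholders, not values from this commit):

```python
import pandas as pd

querytable = pd.DataFrame()  # placeholder; the pipeline builds this from GEO metadata
proj = GSEproject(
    gsename="GSE2770",            # hypothetical GSE accession
    querytable=querytable,
    show_biodbnet_progress=True,  # forwarded to BioDBNet(show_progress=...)
    use_biodbnet_cache=True,      # forwarded to BioDBNet(cache=...)
)
```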
22 changes: 20 additions & 2 deletions main/src/arguments.py
@@ -54,6 +54,24 @@ def range_checker(arg: str):
"help": "Tissue/cell name of models to generate. If making multiple models in a batch, then use the format: 'context1 context2 context3' ",
}

show_biodbnet_progress_arg = {
"flag": "--show-biodbnet-progress",
"action": "store_true",
"required": False,
"default": False,
"dest": "show_biodbnet_progress",
"help": "Show progress of biodbnet queries",
}

use_biodbnet_cache_arg = {
"flag": "--use-biodbnet-cache",
"action": "store_true",
"required": False,
"default": False,
"dest": "use_biodbnet_cache",
"help": "Use biodbnet cache",
}

filtering_technique_arg = {
"flag": "--filtering-technique",
"type": str,
@@ -139,7 +157,7 @@

min_count_arg = {
"flag": "--min-count",
"type": int | str,
"type": str,
"required": False,
"default": "default",
"dest": "min_count",
@@ -405,7 +423,7 @@ def range_checker(arg: str):

expression_requirement_arg = {
"flag": "--expression-requirement",
"type": int | str,
"type": str,
"required": False,
"default": "default",
"dest": "expression_requirement",
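These dicts are shared argument definitions, not direct `add_argument` keywords: `argparse.ArgumentParser.add_argument` takes the option string positionally and accepts no `flag` keyword, which is why the create_context_specific_model.py diff below splits it out. A self-contained sketch of that pattern using one of the new flags:

```python
import argparse

show_biodbnet_progress_arg = {
    "flag": "--show-biodbnet-progress",
    "action": "store_true",
    "required": False,
    "default": False,
    "dest": "show_biodbnet_progress",
    "help": "Show progress of biodbnet queries",
}

parser = argparse.ArgumentParser()
# Pass the flag positionally; every remaining key is a valid add_argument keyword
parser.add_argument(
    show_biodbnet_progress_arg["flag"],
    **{k: v for k, v in show_biodbnet_progress_arg.items() if k != "flag"},
)

args = parser.parse_args(["--show-biodbnet-progress"])
assert args.show_biodbnet_progress is True  # store_true: present -> True, absent -> False
```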
20 changes: 1 addition & 19 deletions main/src/cluster_rnaseq.py
@@ -47,7 +47,7 @@
# cluster_io = SignatureTranslatedAnonymousPackage(string, "cluster_io")


def main() -> None:
def main(argv) -> None:
"""
Cluster RNA-seq Data
"""
@@ -205,24 +205,6 @@ def main() -> None:
seed=seed,
)
cluster_samples.call_function("cluster_samples_main")
# cluster_io = rpy2_api.Rpy2(r_file_path=r_file_path)
# cluster_io_function = cluster_io.call_function("cluster_samples_main")
# cluster_io_function(
# wd,
# context_names,
# technique,
# clust_algo,
# label,
# min_dist=min_dist,
# n_neigh_rep=n_neigh_rep,
# n_neigh_batch=n_neigh_batch,
# n_neigh_cont=n_neigh_cont,
# rep_ratio=rep_ratio,
# batch_ratio=batch_ratio,
# quantile=quantile,
# min_count=min_count,
# seed=seed,
# )


if __name__ == "__main__":
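With `main` now taking `argv`, the entry point under the truncated hunk presumably forwards the command-line arguments; a minimal sketch of that convention (the forwarding call is an assumption, not shown in this diff):

```python
import sys

def main(argv) -> None:
    """Cluster RNA-seq data (body elided)."""
    ...

if __name__ == "__main__":
    main(sys.argv[1:])
```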
26 changes: 14 additions & 12 deletions main/src/create_context_specific_model.py
@@ -589,18 +589,20 @@ def parse_args(argv):
"https://github.com/HelikarLab/MADRID or email [email protected]",
)

parser.add_argument(**context_names_arg)
parser.add_argument(**reference_model_filepath_arg)
parser.add_argument(**active_genes_filepath_arg)
parser.add_argument(**objective_function_arg)
parser.add_argument(**boundary_reactions_filepath_arg)
parser.add_argument(**exclude_reactions_filepath_arg)
parser.add_argument(**force_reactions_filepath_arg)
parser.add_argument(**reconstruction_algorithm_arg)
parser.add_argument(**imat_low_threshold_arg)
parser.add_argument(**imat_high_threshold_arg)
parser.add_argument(**reconstruction_solver_arg)
parser.add_argument(**output_filetypes_arg)
# fmt: off
parser.add_argument(context_names_arg["flag"], **{k: v for k, v in context_names_arg.items() if k != "flag"})
parser.add_argument(reference_model_filepath_arg["flag"], **{k: v for k, v in reference_model_filepath_arg.items() if k != "flag"})
parser.add_argument(active_genes_filepath_arg["flag"], **{k: v for k, v in active_genes_filepath_arg.items() if k != "flag"})
parser.add_argument(objective_function_arg["flag"], **{k: v for k, v in objective_function_arg.items() if k != "flag"})
parser.add_argument(boundary_reactions_filepath_arg["flag"], **{k: v for k, v in boundary_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(exclude_reactions_filepath_arg["flag"], **{k: v for k, v in exclude_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(force_reactions_filepath_arg["flag"], **{k: v for k, v in force_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(reconstruction_algorithm_arg["flag"], **{k: v for k, v in reconstruction_algorithm_arg.items() if k != "flag"})
parser.add_argument(imat_low_threshold_arg["flag"], **{k: v for k, v in imat_low_threshold_arg.items() if k != "flag"})
parser.add_argument(imat_high_threshold_arg["flag"], **{k: v for k, v in imat_high_threshold_arg.items() if k != "flag"})
parser.add_argument(reconstruction_solver_arg["flag"], **{k: v for k, v in reconstruction_solver_arg.items() if k != "flag"})
parser.add_argument(output_filetypes_arg["flag"], **{k: v for k, v in output_filetypes_arg.items() if k != "flag"})
# fmt: on

args = parser.parse_args()
return args