Use fast_bioservices instead of multi_bioservices (#181)
* Fix conflicting package requirements

* Initial `ruff` formatting

* Added command line arguments to expose additional options

* Fixed imports, `ruff` formatting and import sorting

* Ignore Rout files

* Migrate from `multi_bioservices` to `fast_bioservices`

This also formats files using `ruff`, which makes it appear as if there are many more changes than actually occurred (a before/after sketch of the migration follows the file summary below)

* Use fast_bioservices instead of multi_bioservices

* Format with ruff, use fast_bioservices instead of multi_bioservices. This file may be deleted/reorganized because it is related to microarray, which we are removing from COMO

* Fix arguments, add biodbnet progress

* Fix `Input` usage

* Fix argument formatting and usage

* Fix argument usage

* Ignore microarray.db

* Fix `multi_bioservices` import, skip microarray tests
Josh Loecker authored Apr 26, 2024
1 parent 7ad61b5 commit ce2c833
Showing 15 changed files with 387 additions and 224 deletions.
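The heart of the migration, visible in the main/src/GSEpipelineFast.py diff below: the module-level `db2db` function and `InputDatabase`/`OutputDatabase` enums from `multi_bioservices` are replaced by a `BioDBNet` client object and `Input`/`Output` enums from `fast_bioservices`, with progress reporting and caching set at construction. A minimal before/after sketch (the probe ID is a hypothetical placeholder):

```python
from fast_bioservices import BioDBNet, Input, Output

# Before (multi_bioservices): a module-level function with *Database enums
# from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, db2db
# df = db2db(input_values=probe_ids, input_db=InputDatabase.AGILENT_ID,
#            output_db=[OutputDatabase.GENE_ID])

# After (fast_bioservices): a client object that carries its own options
biodbnet = BioDBNet(show_progress=True, cache=True)
df = biodbnet.db2db(
    input_values=["A_23_P100001"],  # hypothetical Agilent probe ID
    input_db=Input.AGILENT_ID,
    output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
    taxon=9606,  # human, matching the notebook's taxon_id
)
```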
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ main/microarray.db
main/data/config_sheets/*
main/data/GSE*_RAW
main/data/gpl*entrez.csv
main/src/microarray.db
2 changes: 1 addition & 1 deletion environment.yaml
@@ -60,10 +60,10 @@ dependencies:
# - conda-forge::xlrd~=2.0.1
- gurobi::gurobi
- pip:
- multi_bioservices
# - escher==1.7.3
- git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py
- framed==0.5.*
- memote<=1.0
- git+https://github.com/JoshLoecker/cobamp.git
- git+https://github.com/JoshLoecker/troppo.git
- git+https://github.com/JoshLoecker/fast_bioservices.git
12 changes: 12 additions & 0 deletions main/COMO.ipynb
@@ -799,11 +799,17 @@
"mrna_weight = 6\n",
"single_cell_weight = 6\n",
"proteomics_weight = 10\n",
"taxon_id = 9606 # Human\n",
"show_biodbnet_progress = True\n",
"use_biodbnet_cache = True\n",
"\n",
"cmd = \" \".join(\n",
" [\n",
" \"python3\", \"src/merge_xomics.py\",\n",
" \"--merge-distribution\",\n",
" \"--taxon-id\", f\"{taxon_id}\",\n",
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
" #\"--microarray-config-file\", f\"{microarray_config_file}\", # If using micro-array, uncomment the start of this line\n",
" \"--total-rnaseq-config-file\", f\"{trnaseq_config_file}\",\n",
" # \"--mrnaseq-config-file\", f\"{mrnaseq_config_file}\",\n",
@@ -1229,6 +1235,9 @@
"import json\n",
"from src.utilities import stringlist_to_list\n",
"\n",
"show_biodbnet_progress = True\n",
"use_biodbnet_cache = True\n",
"\n",
"drug_raw_file = \"Repurposing_Hub_export.txt\"\n",
"for context in stringlist_to_list(context_names):\n",
" for recon_algorithm in recon_algorithms:\n",
Expand Down Expand Up @@ -1275,6 +1284,9 @@
" \"--disease-up\", f\"{up_regulated_disease_genes}\",\n",
" \"--disease-down\", f\"{down_regulated_disease_genes}\",\n",
" \"--raw-drug-file\", f\"{drug_raw_file}\",\n",
" \"--taxon-id\", f\"{taxon_id}\",\n",
" \"--show-biodbnet-progress\", f\"{show_biodbnet_progress}\",\n",
" \"--use-biodbnet-cache\", f\"{use_biodbnet_cache}\",\n",
" \"--solver\", f\"{sovler}\",\n",
" #\"--test-all\"\n",
" ]\n",
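For orientation, the first notebook cell above joins its tokens into a plain shell command along these lines (the config filename is illustrative; note that the `True`/`False` values are interpolated as literal strings):

```python
# Roughly what the " ".join(...) in the cell above produces:
cmd = (
    "python3 src/merge_xomics.py --merge-distribution "
    "--taxon-id 9606 "
    "--show-biodbnet-progress True "
    "--use-biodbnet-cache True "
    "--total-rnaseq-config-file trnaseq_config.xlsx"  # illustrative filename
)
```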
75 changes: 46 additions & 29 deletions main/src/GSEpipelineFast.py
@@ -7,11 +7,9 @@
import numpy as np
import pandas as pd
import rpy2.robjects as ro
from fast_bioservices import BioDBNet, Input, Output
from GSEpipeline import load_gse_soft
from instruments import AffyIO

# from fast_bioservices import BioDBNet, Input, Output
from multi_bioservices.biodbnet import InputDatabase, OutputDatabase, TaxonID, db2db
from rpy2.robjects import pandas2ri

pandas2ri.activate()
@@ -21,8 +19,17 @@

# gse = load_gse_soft(gsename)

from fast_bioservices import BioDBNet, Input, Output


def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor="affy"):
def download_gsm_id_maps(
datadir,
gse,
biodbnet: BioDBNet,
taxon_id: int,
gpls: Optional[list[str]] = None,
vendor="affy",
):
"""
download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
:param gpls:
@@ -46,18 +53,19 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=
table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
].tolist()

temp = db2db(
temp = biodbnet.db2db(
input_values=input_values,
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID, OutputDatabase.ENSEMBL_GENE_ID],
input_db=Input.AGILENT_ID,
output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID],
taxon=taxon_id,
)

temp.drop(columns=["Ensembl Gene ID"], inplace=True)
temp.reset_index(inplace=True)
temp.rename(
columns={
InputDatabase.AGILENT_ID.value: "ID",
OutputDatabase.GENE_ID.value: "ENTREZ_GENE_ID",
Input.AGILENT_ID.value: "ID",
Output.GENE_ID.value: "ENTREZ_GENE_ID",
},
inplace=True,
)
@@ -74,14 +82,27 @@ def download_gsm_id_maps(datadir, gse, gpls: Optional[list[str]] = None, vendor=


class GSEproject:
def __init__(self, gsename, querytable, rootdir="../"):
def __init__(
self,
gsename,
querytable,
show_biodbnet_progress: bool = False,
use_biodbnet_cache: bool = True,
rootdir="../",
):
self.gsename = gsename
# Setup paths
self.querytable = querytable
self.rootdir = rootdir
self.datadir = os.path.join(self.rootdir, "data")
self.outputdir = os.path.join(self.rootdir, "output")
self.gene_dir = os.path.join(self.datadir, self.gsename + "_RAW")

self.biodbnet = BioDBNet(
show_progress=show_biodbnet_progress,
cache=use_biodbnet_cache,
)

print(
"Initialize project ({}):\nRoot: {}\nRaw data: {}".format(
self.gsename, self.rootdir, self.gene_dir
@@ -137,7 +158,13 @@ def get_gsm_tables(self):
if not os.path.isfile(filepath):
# Could improve to automatic download new tables based on platform
gse = load_gse_soft(self.gsename)
download_gsm_id_maps(self.datadir, gse, gpls=[gpl], vendor=vendor)
download_gsm_id_maps(
self.datadir,
gse,
gpls=[gpl],
vendor=vendor,
biodbnet=self.biodbnet,
)
print("Skip Unsupported Platform: {}, {}".format(gpl, vendor))
# continue
temp = pd.read_csv(filepath)
@@ -225,16 +252,6 @@ def get_entrez_table_pipeline(self, fromcsv=True):
output_db=[OutputDatabase.GENE_ID],
)

outputdf = instruments.readagilent(
platformdir, list(self.gsm_platform.keys())
)

gsm_maps[key] = db2db(
input_values=list(map(str, list(outputdf["ProbeName"]))),
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID],
)

gsm_maps[key].rename(
columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True
)
@@ -271,23 +288,23 @@ def get_entrez_table_pipeline(self, fromcsv=True):
how="outer",
)

df_outer_sc500.dropna(how="all", inplace=True) # type: ignore
print("Full: {}".format(df_outer_sc500.shape)) # type: ignore
df_outer_sc500.rename(str.lower, axis="columns", inplace=True) # type: ignore
df_outer_sc500.dropna(how="all", inplace=True)
print("Full: {}".format(df_outer_sc500.shape))
df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
keys = []
vals = []
gsms_loaded = []

for col in list(df_outer_sc500): # type: ignore
if ".cel.gz" in col: # type: ignore
strs = col.split(".cel.gz") # type: ignore
for col in list(df_outer_sc500):
if ".cel.gz" in col:
strs = col.split(".cel.gz")
gsm = strs[0].split("_")[0]
newcol = "{}.cel.gz{}".format(gsm, strs[-1])
vals.append(newcol)
keys.append(col)
gsms_loaded.append(gsm)

df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True) # type: ignore
df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))

# Remove duplicated items, keep largest VALUE for each GSM
@@ -329,7 +346,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
)

try:
temp = df_outer_sc500.loc[:, [col1, col2, col3]] # type: ignore
temp = df_outer_sc500.loc[:, [col1, col2, col3]]

except:
if key in list(self.gsm_platform.keys()):
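A hypothetical construction of the updated class, showing how the two new options flow through to the `BioDBNet` client (the GSE accession and query table are placeholders, not values from this commit):

```python
import pandas as pd

querytable = pd.DataFrame()  # placeholder; the pipeline builds this from GEO metadata
proj = GSEproject(
    gsename="GSE2770",            # hypothetical GSE accession
    querytable=querytable,
    show_biodbnet_progress=True,  # forwarded to BioDBNet(show_progress=...)
    use_biodbnet_cache=True,      # forwarded to BioDBNet(cache=...)
)
```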
22 changes: 20 additions & 2 deletions main/src/arguments.py
@@ -54,6 +54,24 @@ def range_checker(arg: str):
"help": "Tissue/cell name of models to generate. If making multiple models in a batch, then use the format: 'context1 context2 context3' ",
}

show_biodbnet_progress_arg = {
"flag": "--show-biodbnet-progress",
"action": "store_true",
"required": False,
"default": False,
"dest": "show_biodbnet_progress",
"help": "Show progress of biodbnet queries",
}

use_biodbnet_cache_arg = {
"flag": "--use-biodbnet-cache",
"action": "store_true",
"required": False,
"default": False,
"dest": "use_biodbnet_cache",
"help": "Use biodbnet cache",
}

filtering_technique_arg = {
"flag": "--filtering-technique",
"type": str,
@@ -139,7 +157,7 @@

min_count_arg = {
"flag": "--min-count",
"type": int | str,
"type": str,
"required": False,
"default": "default",
"dest": "min_count",
@@ -405,7 +423,7 @@ def range_checker(arg: str):

expression_requirement_arg = {
"flag": "--expression-requirement",
"type": int | str,
"type": str,
"required": False,
"default": "default",
"dest": "expression_requirement",
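These dicts are shared argument definitions, not direct `add_argument` keywords: `argparse.ArgumentParser.add_argument` takes the option string positionally and accepts no `flag` keyword, which is why the create_context_specific_model.py diff below splits it out. A self-contained sketch of that pattern using one of the new flags:

```python
import argparse

show_biodbnet_progress_arg = {
    "flag": "--show-biodbnet-progress",
    "action": "store_true",
    "required": False,
    "default": False,
    "dest": "show_biodbnet_progress",
    "help": "Show progress of biodbnet queries",
}

parser = argparse.ArgumentParser()
# Pass the flag positionally; every remaining key is a valid add_argument keyword
parser.add_argument(
    show_biodbnet_progress_arg["flag"],
    **{k: v for k, v in show_biodbnet_progress_arg.items() if k != "flag"},
)

args = parser.parse_args(["--show-biodbnet-progress"])
assert args.show_biodbnet_progress is True  # store_true: present -> True, absent -> False
```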
20 changes: 1 addition & 19 deletions main/src/cluster_rnaseq.py
@@ -47,7 +47,7 @@
# cluster_io = SignatureTranslatedAnonymousPackage(string, "cluster_io")


def main() -> None:
def main(argv) -> None:
"""
Cluster RNA-seq Data
"""
@@ -205,24 +205,6 @@ def main() -> None:
seed=seed,
)
cluster_samples.call_function("cluster_samples_main")
# cluster_io = rpy2_api.Rpy2(r_file_path=r_file_path)
# cluster_io_function = cluster_io.call_function("cluster_samples_main")
# cluster_io_function(
# wd,
# context_names,
# technique,
# clust_algo,
# label,
# min_dist=min_dist,
# n_neigh_rep=n_neigh_rep,
# n_neigh_batch=n_neigh_batch,
# n_neigh_cont=n_neigh_cont,
# rep_ratio=rep_ratio,
# batch_ratio=batch_ratio,
# quantile=quantile,
# min_count=min_count,
# seed=seed,
# )


if __name__ == "__main__":
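With `main` now taking `argv`, the entry point under the truncated hunk presumably forwards the command-line arguments; a minimal sketch of that convention (the forwarding call is an assumption, not shown in this diff):

```python
import sys

def main(argv) -> None:
    """Cluster RNA-seq data (body elided)."""
    ...

if __name__ == "__main__":
    main(sys.argv[1:])
```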
26 changes: 14 additions & 12 deletions main/src/create_context_specific_model.py
@@ -589,18 +589,20 @@ def parse_args(argv):
"https://github.com/HelikarLab/MADRID or email [email protected]",
)

parser.add_argument(**context_names_arg)
parser.add_argument(**reference_model_filepath_arg)
parser.add_argument(**active_genes_filepath_arg)
parser.add_argument(**objective_function_arg)
parser.add_argument(**boundary_reactions_filepath_arg)
parser.add_argument(**exclude_reactions_filepath_arg)
parser.add_argument(**force_reactions_filepath_arg)
parser.add_argument(**reconstruction_algorithm_arg)
parser.add_argument(**imat_low_threshold_arg)
parser.add_argument(**imat_high_threshold_arg)
parser.add_argument(**reconstruction_solver_arg)
parser.add_argument(**output_filetypes_arg)
# fmt: off
parser.add_argument(context_names_arg["flag"], **{k: v for k, v in context_names_arg.items() if k != "flag"})
parser.add_argument(reference_model_filepath_arg["flag"], **{k: v for k, v in reference_model_filepath_arg.items() if k != "flag"})
parser.add_argument(active_genes_filepath_arg["flag"], **{k: v for k, v in active_genes_filepath_arg.items() if k != "flag"})
parser.add_argument(objective_function_arg["flag"], **{k: v for k, v in objective_function_arg.items() if k != "flag"})
parser.add_argument(boundary_reactions_filepath_arg["flag"], **{k: v for k, v in boundary_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(exclude_reactions_filepath_arg["flag"], **{k: v for k, v in exclude_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(force_reactions_filepath_arg["flag"], **{k: v for k, v in force_reactions_filepath_arg.items() if k != "flag"})
parser.add_argument(reconstruction_algorithm_arg["flag"], **{k: v for k, v in reconstruction_algorithm_arg.items() if k != "flag"})
parser.add_argument(imat_low_threshold_arg["flag"], **{k: v for k, v in imat_low_threshold_arg.items() if k != "flag"})
parser.add_argument(imat_high_threshold_arg["flag"], **{k: v for k, v in imat_high_threshold_arg.items() if k != "flag"})
parser.add_argument(reconstruction_solver_arg["flag"], **{k: v for k, v in reconstruction_solver_arg.items() if k != "flag"})
parser.add_argument(output_filetypes_arg["flag"], **{k: v for k, v in output_filetypes_arg.items() if k != "flag"})
# fmt: on

args = parser.parse_args()
return args