From 9dd8bc39d1be6cb8be42a7eb44e9fe018e440f7a Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi
Date: Mon, 8 Jan 2024 14:26:48 +0000
Subject: [PATCH] allow .csv, tsv, txt, xlsx input to gsea

---
 anoexpress/anoexpress.py | 63 ++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/anoexpress/anoexpress.py b/anoexpress/anoexpress.py
index f1dc220..a3572de 100644
--- a/anoexpress/anoexpress.py
+++ b/anoexpress/anoexpress.py
@@ -78,6 +78,49 @@ def metadata(analysis, microarray=False):
     return metadata
 
 
+def resolve_gene_id(gene_id, analysis):
+    """
+    Resolve a gene_id argument into the gene identifiers it refers to.
+
+    Parameters
+    ----------
+    gene_id : str or list
+        If a string starting with a contig name ('2L', '2R', '3L', '3R', 'X'),
+        it is treated as a genomic region and the IDs of the genes in that
+        region are looked up with malariagen_data. If a path ending in .tsv,
+        .txt, .csv or .xlsx, the first column of the file is read as gene
+        ids. Anything else is returned unchanged.
+    analysis : str
+        The analysis being run. Genomic regions are not supported for 'fun'.
+
+    Returns
+    -------
+    gene_id : list, or the input unchanged
+
+    Raises
+    ------
+    ValueError
+        If a genomic region is requested for the 'fun' analysis.
+    """
+    if isinstance(gene_id, str):
+        if gene_id.startswith(('2L', '2R', '3L', '3R', 'X', '2RL', '3RL')):
+            if analysis == 'fun':
+                # a bare `assert "message"` is always true, so raise explicitly
+                raise ValueError("Unfortunately the genome feature file in malariagen_data does not contain AFUN identifiers, so we cannot subset by genomic span for An. funestus.")
+            import malariagen_data
+            ag3 = malariagen_data.Ag3()
+            gff = ag3.genome_features(region=gene_id).query("type == 'gene'")
+            gene_id = gff.ID.to_list()
+        elif gene_id.endswith(('.tsv', '.txt')):
+            gene_id = pd.read_csv(gene_id, sep="\t", header=None).iloc[:, 0].to_list()
+        elif gene_id.endswith('.csv'):
+            gene_id = pd.read_csv(gene_id, header=None).iloc[:, 0].to_list()
+        elif gene_id.endswith('.xlsx'):
+            gene_id = pd.read_excel(gene_id, header=None).iloc[:, 0].to_list()
+
+    return gene_id
+
+
 def data(data_type, analysis, microarray=False, gene_id=None, sort_by=None, annotations=False, pvalue_filter=None, fraction_na_allowed=None):
     """
     Load the combined data for a given analysis and sample query
@@ -131,21 +174,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sort_by=None, anno
 
     # subset to the gene ids of interest including reading file
     if gene_id is not None:
-        if isinstance(gene_id, str):
-            if gene_id.startswith(('2L', '2R', '3L', '3R', 'X', '2RL', '3RL')):
-                import malariagen_data
-                if analysis == 'fun':
-                    assert "Unfortunately the genome feature file in malariagen_data does not contain AFUN identifiers, so we cannot subset by genomic span for An. funestus."
-                else:
-                    ag3 = malariagen_data.Ag3()
-                    gff = ag3.genome_features(region=gene_id).query("type == 'gene'")
-                    gene_id = gff.ID.to_list()
-            elif gene_id.endswith(('.tsv', '.txt')):
-                gene_id = pd.read_csv(gene_id, sep="\t", header=None).iloc[:, 0].to_list()
-            elif gene_id.endswith('.csv'):
-                gene_id = pd.read_csv(gene_id, header=None).iloc[:, 0].to_list()
-            elif gene_id.endswith('.xlsx'):
-                gene_id = pd.read_excel(gene_id, header=None).iloc[:, 0].to_list()
+        gene_id = resolve_gene_id(gene_id=gene_id, analysis=analysis)
         df = df.query("GeneID in @gene_id")
     if annotations:
         # add gene name and description to the dataframe as index
@@ -217,6 +246,8 @@ def plot_gene_expression(gene_id, analysis="gamb_colu_arab_fun", microarray=Fals
     sort_by : {"median", "mean", "agap", None}, optional
         sort by median/mean of fold changes (descending), or by AGAP, or dont sort input gene ids.
     identifier
+    pvalue_filter: float, optional
+        if provided, fold-change entries with an adjusted p-value below the threshold will be removed from the plot. Default is None.
     width : int
         Width in pixels of the plotly figure
     height: int, optional
@@ -444,7 +475,7 @@ def load_genes_for_enrichment(analysis, func, gene_ids, percentile):
         percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
         top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)]
     elif gene_ids:
-        top_geneIDs = gene_ids
+        top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis)
 
     return top_geneIDs, fc_genes
 