Skip to content

Commit

Permalink
fix ciruclar import and split data
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjaynagi committed Sep 29, 2024
1 parent b654a25 commit 7f17376
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 392 deletions.
5 changes: 3 additions & 2 deletions anoexpress/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# flake8: noqa
from .load_data import *
from .gsea import *
from .data import *
from .utils import *
from .candidates import *
from .plot import *
from .gsea import *
135 changes: 135 additions & 0 deletions anoexpress/candidates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import numpy as np
import pandas as pd

from .data import data, load_annotations
from .utils import resolve_gene_id, load_gff, _gene_ids_from_annotation


def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation=None, query_fc=None, microarray=False, low_count_filter=None, fraction_na_allowed=None):
"""
Load the candidate genes for a given analysis. Optionally, filter by annotation or fold change data.
Parameters
----------
analysis: {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
which analysis to load gene expression data for. analyses with more species will have less genes
present, due to the process of finding orthologs.
name: str, optional
name of the function to rank genes by. Defaults to 'median'
func: function, optional
function to rank genes by. Defaults to np.nanmedian
query_annotation: str or list, optional
filter genes by GO or PFAM annotation. Defaults to None
query_fc: float, optional
filter genes by fold change. Defaults to None
microarray: bool, optional
whether to include the IR-Tex microarray data in the requested data. Default is False.
low_count_filter: int, optional
if provided, genes with a median count below the threshold will be removed before gene ranking. Default is None.
fraction_nas_allowed: float, optional
fraction of missing values allowed in the data. Defaults to 0.5
Returns
-------
fc_ranked: pd.DataFrame
"""

fc_data = data(data_type='fcs', analysis=analysis, microarray=microarray, annotations=True, sort_by=None, low_count_filter=low_count_filter, fraction_na_allowed=fraction_na_allowed)

if query_annotation is not None:
gene_annot_df = load_annotations()
gene_ids = _gene_ids_from_annotation(gene_annot_df=gene_annot_df, annotation=query_annotation)
fc_data = fc_data.query("GeneID in @gene_ids")
assert not fc_data.empty, "No genes were found for the selection. It is possible these genes were removed by the ortholog finding process"

fc_ranked = fc_data.apply(func, axis=1).to_frame().rename(columns={0:f'{name} log2 Fold Change'}).copy()
fc_ranked = fc_ranked.sort_values(f'{name} log2 Fold Change', ascending=False)
fc_ranked = fc_ranked.reset_index()
fc_ranked.loc[:, f'{name} Fold Change'] = np.round(2**fc_ranked.loc[:, f'{name} log2 Fold Change'], 2)

if query_fc is not None:
fc_ranked = fc_ranked.query(f'`{name} Fold Change` > {query_fc}')

return(fc_ranked)





def consistent_genes(analysis, direction, n_experiments, low_count_filter=None):

fc_data = data(data_type="fcs", analysis=analysis, microarray=False, annotations=True, low_count_filter=low_count_filter)
pval_data = data(data_type='pvals', analysis=analysis, microarray=False, annotations=True, low_count_filter=low_count_filter)

print(f"There are {fc_data.shape[0]} genes and {fc_data.shape[1]} differential expression comparisons in {analysis}")
if direction == 'up':
res_df = fc_data[fc_data.apply(lambda x: (x > 0).sum() >= n_experiments , axis=1)]
res_pval = pval_data[pval_data.apply(lambda x: x < 0.05, axis=1).sum(axis=1) > n_experiments].reset_index()['GeneID'].to_list()
res_df = res_df.query("GeneID in @res_pval")

if res_df.empty:
print(f"There are no genes expressed direction={direction} in {n_experiments} experiments")
return
else:
return(res_df)
else:
res_df = fc_data[fc_data.apply(lambda x: (x < 0).sum() >= n_experiments, axis=1)]
res_pval = pval_data[pval_data.apply(lambda x: x < 0.05, axis=1).sum(axis=1) > n_experiments].reset_index()['GeneID'].to_list()
res_df = res_df.query("GeneID in @res_pval")
if res_df.empty:
print(f"There are no genes expressed {direction} in {n_experiments} experiments")
return
return(res_df)



def contig_expression(contig, analysis, data_type='fcs', microarray=False, pvalue_filter=None, size=10, step=5, fraction_na_allowed=None):
"""
Calculate fold change data for a given contig. Returns both raw fold change data for each experiment and a moving average
Parameters
----------
contig: str
contig, one of 2L, 2R, 3L, 3R, X or 2RL, 3RL
analysis: {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
which analysis to load gene expression data for. analyses with more species will have less genes
present, due to the process of finding orthologs.
data_type: {"fcs", "log2counts"}, optional
whether to load fold change data or log2 counts data. Defaults to 'fcs'
microarray: bool, optional
whether to include the IR-Tex microarray data in the plot
pvalue_filter: float, optional
filter genes by pvalue. Defaults to None
size: int, optional
size of window in genes for moving average. Defaults to 10
step: int, optional
step size in genes for moving average. Defaults to 5
fraction_na_allowed: float, optional
fraction of missing values allowed for each gene in the data. Defaults to no filter.
"""
import malariagen_data
import allel

fc_data = data(
data_type=data_type,
analysis=analysis,
microarray=microarray,
annotations=True,
pvalue_filter=pvalue_filter,
fraction_na_allowed=fraction_na_allowed
).reset_index()

ag3 = malariagen_data.Ag3()
gff = ag3.genome_features(contig).query("type == 'gene'").assign(midpoint = lambda x: (x.start + x.end)/2)
genes_df = gff[['ID', 'midpoint']].rename(columns={'ID':'GeneID'})
genes_df = genes_df.merge(fc_data, how='left').set_index(['GeneID', 'GeneName', 'GeneDescription', 'midpoint'])
# calculate moving average of fold changes
median_fc = genes_df.mean(axis=1).to_frame().rename(columns={0:'median_fc'}).reset_index()
moving_mid = allel.moving_statistic(median_fc.midpoint, np.nanmedian, size=size, step=step)
moving_med = allel.moving_statistic(median_fc['median_fc'], np.nanmedian, size=size, step=step)
windowed_fold_change_df = pd.DataFrame({'midpoint':moving_mid, 'median_fc':moving_med})

fold_change_df = genes_df.reset_index().melt(id_vars=['midpoint', 'GeneID', 'GeneName', 'GeneDescription'], var_name='comparison', value_name='fold_change')
return fold_change_df, windowed_fold_change_df

2 changes: 1 addition & 1 deletion anoexpress/gsea.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import numpy as np

from .load_data import load_candidates
from .data import load_candidates
from .utils import resolve_gene_id


Expand Down
Loading

0 comments on commit 7f17376

Please sign in to comment.