Skip to content

Commit

Permalink
Merge pull request SouthernBio#16 from SouthernBio/feature/zscore
Browse files Browse the repository at this point in the history
Functions for analyzing dataframes
  • Loading branch information
fx-biocoder authored Mar 14, 2024
2 parents 40f3408 + fa1cae0 commit f22722d
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 62 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ numpy = "*"
pandas = "*"
matplotlib = "*"
sh = "*"
scipy = "*"

[dev-packages]

Expand Down
149 changes: 87 additions & 62 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

97 changes: 97 additions & 0 deletions tools/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pandas as pd
from scipy.stats import zscore
from fnmatch import fnmatch


class NotADataframe(Exception):
    """Raised when a function is given a file that is not a CSV dataframe.

    The exception takes no constructor arguments; every instance carries the
    fixed text stored in the class attribute ``message``.
    """

    # Text passed to ``Exception.__init__`` for every instance.
    message = "Error: the function must receive a dataframe in CSV format."

    def __init__(self):
        text = self.message
        super().__init__(text)


def normalize_conservation_rate(file_path: str) -> pd.DataFrame:
    """Normalize conservation rates by computing the z-score of each value.

    The CSV is read with its first column as the index, and a new column
    ``NormalizedConservationRate`` is added holding the z-score of the
    ``ConservationRate`` column (computed over the whole column).

    Args:
        file_path (str): Path to a CSV file that has a ``ConservationRate``
            column. The name must match ``*.csv``.

    Returns:
        pd.DataFrame: The loaded dataframe with the extra
        ``NormalizedConservationRate`` column.

    Raises:
        NotADataframe: If ``file_path`` does not match ``*.csv``.
        KeyError: If the file has no ``ConservationRate`` column.
    """
    if not fnmatch(file_path, '*.csv'):
        raise NotADataframe()

    dataframe = pd.read_csv(file_path, delimiter=',', header=0, index_col=0)

    # Compute the z-score on a plain 1-D array and assign it to a single
    # column. This keeps the assignment positional (no index alignment) and
    # avoids relying on list-key assignment to a column that does not exist yet.
    rates = dataframe['ConservationRate'].to_numpy()
    dataframe['NormalizedConservationRate'] = zscore(rates)

    # NOTE: a column with zero variance (e.g. a single row or all-equal
    # values) yields NaN here, because the standard deviation is zero.
    # TODO: Modify this function (or create a new one)
    # to handle conservation rate values of zero

    return dataframe


def split_codon_pairs(file_path: str) -> pd.DataFrame:
    """Add two columns holding the constituent codons of each codon pair.

    The first column of the CSV is treated as the codon-pair column. Each
    value is cut after its third character: the first three characters go to
    ``FirstCodon`` and the remainder goes to ``SecondCodon``.

    Args:
        file_path (str): Path to a CSV file whose first column contains the
            codon pairs. The name must match ``*.csv``.

    Returns:
        pd.DataFrame: The loaded dataframe joined with the new
        ``FirstCodon`` and ``SecondCodon`` columns.

    Raises:
        NotADataframe: If ``file_path`` does not match ``*.csv``.
    """
    is_csv = fnmatch(file_path, '*.csv')
    if not is_csv:
        raise NotADataframe()

    table = pd.read_csv(file_path, delimiter=',', header=0)

    # The codon pairs live in the first column, whatever it is called.
    pair_column = table[table.columns[0]]

    # Cut every pair once, then unzip the halves into their own columns.
    halves = [(pair[:3], pair[3:]) for pair in pair_column]
    codon_columns = pd.DataFrame({
        'FirstCodon': pd.Series([head for head, _ in halves], index=None),
        'SecondCodon': pd.Series([tail for _, tail in halves], index=None),
    })

    # Both frames carry the default positional index, so the join lines up
    # row by row.
    return table.join(codon_columns)


def expected_codon_pair_conservation_rate(file_path: str, codons: str) -> pd.DataFrame:
    """Compute the expected conservation rate of each codon pair.

    The expected rate of a pair is the product of the conservation rates of
    its two constituent codons, looked up in the per-codon table.

    Args:
        file_path (str): Path to a CSV file with ``FirstCodon`` and
            ``SecondCodon`` columns. Its first column is read as the index.
        codons (str): Path to a CSV file with per-codon data. It must have a
            ``ConservationRate`` column; the codon itself is taken from a
            ``Codon`` column when present, otherwise from the index (the
            first column of the file).

    Returns:
        pd.DataFrame: The codon-pair dataframe with a new
        ``ExpectedCodonPairConservationRate`` column. Rows whose first or
        second codon is missing from the codon table get NaN.

    Raises:
        KeyError: If a required column is missing from either file.
    """
    codon_information = pd.read_csv(codons, delimiter=',', header=0, index_col=0)
    dataframe = pd.read_csv(file_path, delimiter=',', header=0, index_col=0)

    # Build a codon -> conservation rate lookup. Because the file is read
    # with index_col=0, the codon may be a regular column or the index.
    if 'Codon' in codon_information.columns:
        rate_by_codon = codon_information.set_index('Codon')['ConservationRate']
    else:
        rate_by_codon = codon_information['ConservationRate']

    # Series.map needs a uniquely indexed lookup; keep the first entry of
    # any repeated codon.
    rate_by_codon = rate_by_codon[~rate_by_codon.index.duplicated(keep='first')]

    # One vectorized lookup per column; unknown codons map to NaN so every
    # row stays aligned with its own pair.
    first_codon_cr = dataframe['FirstCodon'].map(rate_by_codon)
    second_codon_cr = dataframe['SecondCodon'].map(rate_by_codon)

    # Assign positionally so the result lines up with the rows regardless of
    # what the dataframe's index looks like.
    dataframe['ExpectedCodonPairConservationRate'] = (
        first_codon_cr.to_numpy() * second_codon_cr.to_numpy()
    )

    return dataframe

0 comments on commit f22722d

Please sign in to comment.