Skip to content

Commit

Permalink
Adding stats wrappers and improving output.
Browse files Browse the repository at this point in the history
  • Loading branch information
spiderbaby committed Feb 2, 2015
1 parent 7f5bba1 commit 24cade5
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 33 deletions.
48 changes: 44 additions & 4 deletions analysis/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,68 @@
import sys
import os
from libraries import docopt
from stats import get_xy_dataset_statistics, plot, read_file, RInterface
from stats import get_xy_dataset_statistics, plot, read_file, RInterface, format_stats_for_printing
correlation_coefficient_scatterplotplot = RInterface.correlation_coefficient_gplot


def read_json(filename):
    '''
    Parse the named file as JSON and return the decoded object, or None if the
    file cannot be read or does not contain valid JSON. Returning None lets the
    caller fall back to CSV/TSV parsing (see the __main__ block).

    :param filename: Path to a candidate JSON file.
    :return: The decoded JSON object, or None on any failure.
    '''
    try:
        try:
            import json
        except ImportError:
            # Pythons older than 2.6 have no stdlib json; use simplejson instead.
            import simplejson as json
        return json.loads(read_file(filename))
    except Exception:
        # A bare 'except:' here would also swallow SystemExit/KeyboardInterrupt;
        # catching Exception keeps the best-effort contract without doing that.
        return None


def parse_csv(filename):
    '''
    Parse a comma-separated (or, for .tsv files, tab-separated) file into an
    analysis table. Blank lines and lines starting with '#' are ignored.

    Each remaining line must have at least two columns:
      - 3+ columns: Experimental, Predicted, ID (remaining columns ignored);
      - exactly 2 columns: Experimental, Predicted, with a sequential integer
        ID assigned automatically.

    :param filename: Path to the CSV/TSV file.
    :return: A list of dicts with Experimental (float), Predicted (float) and ID keys.
    :raises Exception: wrapping any parse failure with a descriptive message.
    '''
    separator = '\t' if filename.endswith('.tsv') else ','
    try:
        table = []
        auto_id = 1  # renamed from 'id', which shadowed the builtin
        contents = read_file(filename)
        lines = [l.strip().split(separator) for l in contents.split('\n') if l.strip() and not l.strip().startswith('#')]
        for linetokens in lines:
            if len(linetokens) >= 3:
                table.append(dict(Experimental = float(linetokens[0]), Predicted = float(linetokens[1]), ID = str(linetokens[2])))
            elif len(linetokens) == 2:
                # No ID column: assign a sequential integer ID.
                table.append(dict(Experimental = float(linetokens[0]), Predicted = float(linetokens[1]), ID = auto_id))
                auto_id += 1
            else:
                raise Exception('At least two columns (experimental DDG, predicted DDG) are expected.')
        return table
    except Exception as e:  # 'as' syntax works on Python 2.6+ and 3, unlike 'except Exception, e'
        raise Exception('An exception occurred parsing the CSV/TSV file: %s' % str(e))


if __name__ == '__main__':
    # Parse command-line arguments with docopt; the usage text lives in the
    # module docstring (not visible in this chunk).
    try:
        arguments = docopt.docopt(__doc__.format(**locals()))
    except Exception, e:
        print('Failed while parsing arguments: %s.' % str(e))
        sys.exit(1)

    # Read file input file
    input_filename = arguments['<inputfile>'][0]
    if not os.path.exists(input_filename):
        print('Error') # todo: print a meaningful message naming the missing input file
        sys.exit(2)
    # NOTE(review): the next two assignments look like a stripped diff pair —
    # the read_file() line is the pre-image of the read_json() line, and as
    # written the second assignment immediately overwrites the first.
    analysis_table = read_file(input_filename)
    analysis_table = read_json(input_filename)
    # read_json returns None for non-JSON input; fall back to CSV/TSV parsing.
    if not analysis_table:
        analysis_table = parse_csv(input_filename)

    # Set up the output filename
    output_filename = arguments['--output']
    output_filename_ext = os.path.splitext(output_filename)[1].lower()
    # NOTE(review): another stripped-diff pair — os.path.splitext keeps the
    # leading dot, so the dotless membership test is the broken pre-image of
    # the dotted one. Net effect: default to PNG for unrecognized extensions.
    if output_filename_ext not in ['png', 'pdf', 'eps']:
        if output_filename_ext not in ['.png', '.pdf', '.eps']:
            output_filename += '.png'

    # Print the statistics block, then render the scatterplot via R.
    print('\n' + '*'*10 + ' Statistics ' +'*'*10)
    print(format_stats_for_printing(get_xy_dataset_statistics(analysis_table)))

    print('\nSaving scatterplot to %s.\n' % output_filename)
    plot(analysis_table, output_filename, correlation_coefficient_scatterplotplot)

118 changes: 91 additions & 27 deletions analysis/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import math
import time
import tempfile
import copy
import subprocess
import traceback
import inspect
Expand Down Expand Up @@ -84,30 +85,6 @@ def create_csv(analysis_table):
return write_temp_file('.', contents)


def plot(analysis_table, output_filename, RFunction):
    '''
    Render a scatterplot of the analysis table by writing it to a temporary
    CSV file and invoking the supplied R plotting function on that file.
    (NOTE(review): this is the pre-commit version of the function, deleted by
    this commit and re-added further down without the debug prints.)

    :param analysis_table: A list of dicts with Experimental/Predicted/ID entries.
    :param output_filename: Target image path; extension must be .png, .pdf or .eps.
    :param RFunction: A callable taking (input CSV path, output path, filetype).
    :return: output_filename.
    '''
    #R_return_values = {}
    filetype = os.path.splitext(output_filename)[1].lower()
    assert(filetype == '.png' or filetype == '.pdf' or filetype == '.eps')
    filetype = filetype[1:]  # drop the leading dot; RFunction expects e.g. 'png'
    if len(analysis_table) <= 1:
        raise Exception("The analysis table must have at least two points.")
    else:
        input_filename = create_csv(analysis_table)
        # NOTE(review): debug output left in — prints the temp CSV path and contents.
        print(input_filename )
        print(read_file(input_filename))
        try:
            R_output = RFunction(input_filename, output_filename, filetype)
            #R_return_values = RUtilities.parse_R_output(R_output)
            #for k, v in sorted(R_return_values.iteritems()):
            # print(" %s: %s" % (str(k), str(v)))
        except Exception, e:
            print(traceback.format_exc())
            delete_file(input_filename)  # clean up the temp CSV on failure
            raise Exception(e)
        delete_file(input_filename)  # clean up the temp CSV on success
        return output_filename


def get_ranks(values):
'''
Converts raw values into ranks for rank correlation coefficients
Expand Down Expand Up @@ -255,9 +232,18 @@ def mae(x_values, y_values):
return numpy.sum(numpy.apply_along_axis(numpy.abs, 0, numpy.subtract(x_values, y_values))) / float(num_points)


def get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
'''A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
a variety of metrics.'''
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
'''
A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
a variety of metrics.
:param x_values: A list of values for the X-axis (experimental values).
:param y_values: A list of values for the X-axis (predicted values).
:param fcorrect_x_cutoff: See get_xy_dataset_statistics.
:param fcorrect_y_cutoff: See get_xy_dataset_statistics.
:param x_fuzzy_range: See get_xy_dataset_statistics.
:param y_scalar: See get_xy_dataset_statistics.
:return: A table of statistics.
'''
assert(len(x_values) == len(y_values))
return dict(
pearsonr = pearsonr(x_values, y_values),
Expand All @@ -274,6 +260,84 @@ def get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorr
)


def get_xy_dataset_statistics(analysis_table, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    Convenience wrapper around _get_xy_dataset_statistics: unpacks an analysis
    table (a list of record dicts) into parallel X/Y value lists and delegates.

    :param analysis_table: A list of dicts, each carrying Experimental and Predicted float entries.
    :param fcorrect_x_cutoff: X-axis cutoff for the fraction correct metric.
    :param fcorrect_y_cutoff: Y-axis cutoff for the fraction correct metric.
    :param x_fuzzy_range: X-axis fuzzy range for the fuzzy fraction correct metric.
    :param y_scalar: Y-axis scalar for the fuzzy fraction correct metric
                     (used to derive y_cutoff and y_fuzzy_range there).
    :return: A table of statistics.
    '''
    experimental_values = []
    predicted_values = []
    for record in analysis_table:
        experimental_values.append(record['Experimental'])
        predicted_values.append(record['Predicted'])
    return _get_xy_dataset_statistics(
        experimental_values,
        predicted_values,
        fcorrect_x_cutoff = fcorrect_x_cutoff,
        fcorrect_y_cutoff = fcorrect_y_cutoff,
        x_fuzzy_range = x_fuzzy_range,
        y_scalar = y_scalar)


# Maps raw metric keys (as produced by get_xy_dataset_statistics) to
# human-readable names for display.
keymap = dict(
    pearsonr = "Pearson's R",
    spearmanr = "Spearman's R",
    gamma_CC = "Gamma correlation coef.",
    fraction_correct = "Fraction correct",
    fraction_correct_fuzzy_linear = "Fraction correct (fuzzy)",
    ks_2samp = "Kolmogorov-Smirnov test (XY)",
    kstestx = "X-axis Kolmogorov-Smirnov test",
    kstesty = "Y-axis Kolmogorov-Smirnov test",
    normaltestx = "X-axis normality test",
    normaltesty = "Y-axis normality test",
)

# Metrics whose value is a (statistic, p-value) pair, mapped to the label used
# when printing the p-value. All other metrics are printed as plain values.
# (Replaces seven near-identical elif branches in the original.)
_pvalue_label = dict(
    pearsonr = '2-tailed p-value',
    spearmanr = '2-tailed p-value',
    ks_2samp = '2-tailed p-value',
    kstestx = 'p-value',
    kstesty = 'p-value',
    normaltestx = '2-sided chi^2 p-value',
    normaltesty = '2-sided chi^2 p-value',
)


def format_stats_for_printing(stats):
    '''
    Format a statistics dict into a human-readable, alphabetically sorted,
    newline-separated string with one 'Name: value' line per metric.

    :param stats: A dict mapping metric keys to values; the metrics listed in
                  _pvalue_label hold (statistic, p-value) pairs, the rest are
                  printed as plain values.
    :return: The formatted multi-line string.
    '''
    formatted = {}
    # .items() runs on both Python 2 and 3; iteritems() is Python-2-only.
    for k, v in stats.items():
        name = keymap.get(k, k)
        if k in _pvalue_label:
            formatted[name] = '%0.3f (%s=%s)' % (v[0], _pvalue_label[k], str(v[1]))
        else:
            formatted[name] = v
    return '\n'.join('%s: %s' % (str(name).ljust(32), str(value)) for name, value in sorted(formatted.items()))


def plot(analysis_table, output_filename, RFunction):
    '''
    Render a scatterplot of the analysis table by writing it to a temporary
    CSV file and invoking the supplied R plotting function on that file.

    :param analysis_table: A list of dicts with Experimental/Predicted/ID entries.
    :param output_filename: Target image path; extension must be .png, .pdf or .eps.
    :param RFunction: A callable taking (input CSV path, output path, filetype e.g. 'png').
    :return: output_filename.
    :raises Exception: if the table has fewer than two points, or whatever
                       RFunction raises on failure.
    '''
    filetype = os.path.splitext(output_filename)[1].lower()
    assert(filetype in ('.png', '.pdf', '.eps'))
    filetype = filetype[1:]  # drop the leading dot; RFunction expects e.g. 'png'
    if len(analysis_table) <= 1:
        raise Exception("The analysis table must have at least two points.")
    input_filename = create_csv(analysis_table)
    try:
        RFunction(input_filename, output_filename, filetype)
    except Exception:
        # Surface the full R-side failure, then propagate. A bare 'raise'
        # preserves the original exception type and traceback, which the
        # original 'raise Exception(e)' destroyed; callers catching Exception
        # are unaffected.
        print(traceback.format_exc())
        raise
    finally:
        # The temporary CSV is removed on success and on failure alike,
        # replacing the duplicated delete_file() calls on each path.
        delete_file(input_filename)
    return output_filename


if __name__ == '__main__':
import random

Expand Down
2 changes: 1 addition & 1 deletion output/sample/curatedPT_r57471.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Experimental,Predicted,ID,RecordID
#Experimental,Predicted,ID,RecordID
3.000000,2.233660,71686,255
0.600000,1.491790,71687,257
3.100000,1.216640,71688,259
Expand Down
2 changes: 1 addition & 1 deletion output/sample/kellogg_r57471.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Experimental,Predicted,ID,RecordID
#Experimental,Predicted,ID,RecordID
0.800000,2.147640,71689,332
2.600000,3.888480,71692,333
1.200000,2.757570,71694,331
Expand Down

0 comments on commit 24cade5

Please sign in to comment.