Skip to content

Commit

Permalink
Adding stats wrappers and improving output.
Browse files Browse the repository at this point in the history
  • Loading branch information
spiderbaby committed Feb 2, 2015
1 parent 7f5bba1 commit 24cade5
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 33 deletions.
48 changes: 44 additions & 4 deletions analysis/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,68 @@
import sys
import os
from libraries import docopt
from stats import get_xy_dataset_statistics, plot, read_file, RInterface
from stats import get_xy_dataset_statistics, plot, read_file, RInterface, format_stats_for_printing
correlation_coefficient_scatterplotplot = RInterface.correlation_coefficient_gplot


def read_json(filename):
    '''
    Parse the named file as JSON and return the decoded object, or None if the
    file cannot be read or does not contain valid JSON. Returning None lets the
    caller fall back to CSV/TSV parsing (see the __main__ block).

    :param filename: Path to a candidate JSON file.
    :return: The decoded JSON object, or None on any failure.
    '''
    try:
        try:
            import json
        except ImportError:
            # Pythons older than 2.6 have no stdlib json; use simplejson instead.
            import simplejson as json
        return json.loads(read_file(filename))
    except Exception:
        # A bare 'except:' here would also swallow SystemExit/KeyboardInterrupt;
        # catching Exception keeps the best-effort contract without doing that.
        return None


def parse_csv(filename):
    '''
    Parse a comma-separated (or, for .tsv files, tab-separated) file into an
    analysis table. Blank lines and lines starting with '#' are ignored.

    Each remaining line must have at least two columns:
      - 3+ columns: Experimental, Predicted, ID (remaining columns ignored);
      - exactly 2 columns: Experimental, Predicted, with a sequential integer
        ID assigned automatically.

    :param filename: Path to the CSV/TSV file.
    :return: A list of dicts with Experimental (float), Predicted (float) and ID keys.
    :raises Exception: wrapping any parse failure with a descriptive message.
    '''
    separator = '\t' if filename.endswith('.tsv') else ','
    try:
        table = []
        auto_id = 1  # renamed from 'id', which shadowed the builtin
        contents = read_file(filename)
        lines = [l.strip().split(separator) for l in contents.split('\n') if l.strip() and not l.strip().startswith('#')]
        for linetokens in lines:
            if len(linetokens) >= 3:
                table.append(dict(Experimental = float(linetokens[0]), Predicted = float(linetokens[1]), ID = str(linetokens[2])))
            elif len(linetokens) == 2:
                # No ID column: assign a sequential integer ID.
                table.append(dict(Experimental = float(linetokens[0]), Predicted = float(linetokens[1]), ID = auto_id))
                auto_id += 1
            else:
                raise Exception('At least two columns (experimental DDG, predicted DDG) are expected.')
        return table
    except Exception as e:  # 'as' syntax works on Python 2.6+ and 3, unlike 'except Exception, e'
        raise Exception('An exception occurred parsing the CSV/TSV file: %s' % str(e))


if __name__ == '__main__':
    # Parse command-line arguments with docopt; the usage text lives in the
    # module docstring (not visible in this chunk).
    try:
        arguments = docopt.docopt(__doc__.format(**locals()))
    except Exception, e:
        print('Failed while parsing arguments: %s.' % str(e))
        sys.exit(1)

    # Read file input file
    input_filename = arguments['<inputfile>'][0]
    if not os.path.exists(input_filename):
        print('Error') # todo: print a meaningful message naming the missing input file
        sys.exit(2)
    # NOTE(review): the next two assignments look like a stripped diff pair —
    # the read_file() line is the pre-image of the read_json() line, and as
    # written the second assignment immediately overwrites the first.
    analysis_table = read_file(input_filename)
    analysis_table = read_json(input_filename)
    # read_json returns None for non-JSON input; fall back to CSV/TSV parsing.
    if not analysis_table:
        analysis_table = parse_csv(input_filename)

    # Set up the output filename
    output_filename = arguments['--output']
    output_filename_ext = os.path.splitext(output_filename)[1].lower()
    # NOTE(review): another stripped-diff pair — os.path.splitext keeps the
    # leading dot, so the dotless membership test is the broken pre-image of
    # the dotted one. Net effect: default to PNG for unrecognized extensions.
    if output_filename_ext not in ['png', 'pdf', 'eps']:
        if output_filename_ext not in ['.png', '.pdf', '.eps']:
            output_filename += '.png'

    # Print the statistics block, then render the scatterplot via R.
    print('\n' + '*'*10 + ' Statistics ' +'*'*10)
    print(format_stats_for_printing(get_xy_dataset_statistics(analysis_table)))

    print('\nSaving scatterplot to %s.\n' % output_filename)
    plot(analysis_table, output_filename, correlation_coefficient_scatterplotplot)

118 changes: 91 additions & 27 deletions analysis/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import math
import time
import tempfile
import copy
import subprocess
import traceback
import inspect
Expand Down Expand Up @@ -84,30 +85,6 @@ def create_csv(analysis_table):
return write_temp_file('.', contents)


def plot(analysis_table, output_filename, RFunction):
    '''
    Render a scatterplot of the analysis table by writing it to a temporary
    CSV file and invoking the supplied R plotting function on that file.
    (NOTE(review): this is the pre-commit version of the function, deleted by
    this commit and re-added further down without the debug prints.)

    :param analysis_table: A list of dicts with Experimental/Predicted/ID entries.
    :param output_filename: Target image path; extension must be .png, .pdf or .eps.
    :param RFunction: A callable taking (input CSV path, output path, filetype).
    :return: output_filename.
    '''
    #R_return_values = {}
    filetype = os.path.splitext(output_filename)[1].lower()
    assert(filetype == '.png' or filetype == '.pdf' or filetype == '.eps')
    filetype = filetype[1:]  # drop the leading dot; RFunction expects e.g. 'png'
    if len(analysis_table) <= 1:
        raise Exception("The analysis table must have at least two points.")
    else:
        input_filename = create_csv(analysis_table)
        # NOTE(review): debug output left in — prints the temp CSV path and contents.
        print(input_filename )
        print(read_file(input_filename))
        try:
            R_output = RFunction(input_filename, output_filename, filetype)
            #R_return_values = RUtilities.parse_R_output(R_output)
            #for k, v in sorted(R_return_values.iteritems()):
            # print(" %s: %s" % (str(k), str(v)))
        except Exception, e:
            print(traceback.format_exc())
            delete_file(input_filename)  # clean up the temp CSV on failure
            raise Exception(e)
        delete_file(input_filename)  # clean up the temp CSV on success
        return output_filename


def get_ranks(values):
'''
Converts raw values into ranks for rank correlation coefficients
Expand Down Expand Up @@ -255,9 +232,18 @@ def mae(x_values, y_values):
return numpy.sum(numpy.apply_along_axis(numpy.abs, 0, numpy.subtract(x_values, y_values))) / float(num_points)


def get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
'''A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
a variety of metrics.'''
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
'''
A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
a variety of metrics.
:param x_values: A list of values for the X-axis (experimental values).
:param y_values: A list of values for the X-axis (predicted values).
:param fcorrect_x_cutoff: See get_xy_dataset_statistics.
:param fcorrect_y_cutoff: See get_xy_dataset_statistics.
:param x_fuzzy_range: See get_xy_dataset_statistics.
:param y_scalar: See get_xy_dataset_statistics.
:return: A table of statistics.
'''
assert(len(x_values) == len(y_values))
return dict(
pearsonr = pearsonr(x_values, y_values),
Expand All @@ -274,6 +260,84 @@ def get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorr
)


def get_xy_dataset_statistics(analysis_table, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    Convenience wrapper around _get_xy_dataset_statistics: unpacks an analysis
    table (a list of record dicts) into parallel X/Y value lists and delegates.

    :param analysis_table: A list of dicts, each carrying Experimental and Predicted float entries.
    :param fcorrect_x_cutoff: X-axis cutoff for the fraction correct metric.
    :param fcorrect_y_cutoff: Y-axis cutoff for the fraction correct metric.
    :param x_fuzzy_range: X-axis fuzzy range for the fuzzy fraction correct metric.
    :param y_scalar: Y-axis scalar for the fuzzy fraction correct metric
                     (used to derive y_cutoff and y_fuzzy_range there).
    :return: A table of statistics.
    '''
    experimental_values = []
    predicted_values = []
    for record in analysis_table:
        experimental_values.append(record['Experimental'])
        predicted_values.append(record['Predicted'])
    return _get_xy_dataset_statistics(
        experimental_values,
        predicted_values,
        fcorrect_x_cutoff = fcorrect_x_cutoff,
        fcorrect_y_cutoff = fcorrect_y_cutoff,
        x_fuzzy_range = x_fuzzy_range,
        y_scalar = y_scalar)


# Maps raw metric keys (as produced by get_xy_dataset_statistics) to
# human-readable names for display.
keymap = dict(
    pearsonr = "Pearson's R",
    spearmanr = "Spearman's R",
    gamma_CC = "Gamma correlation coef.",
    fraction_correct = "Fraction correct",
    fraction_correct_fuzzy_linear = "Fraction correct (fuzzy)",
    ks_2samp = "Kolmogorov-Smirnov test (XY)",
    kstestx = "X-axis Kolmogorov-Smirnov test",
    kstesty = "Y-axis Kolmogorov-Smirnov test",
    normaltestx = "X-axis normality test",
    normaltesty = "Y-axis normality test",
)

# Metrics whose value is a (statistic, p-value) pair, mapped to the label used
# when printing the p-value. All other metrics are printed as plain values.
# (Replaces seven near-identical elif branches in the original.)
_pvalue_label = dict(
    pearsonr = '2-tailed p-value',
    spearmanr = '2-tailed p-value',
    ks_2samp = '2-tailed p-value',
    kstestx = 'p-value',
    kstesty = 'p-value',
    normaltestx = '2-sided chi^2 p-value',
    normaltesty = '2-sided chi^2 p-value',
)


def format_stats_for_printing(stats):
    '''
    Format a statistics dict into a human-readable, alphabetically sorted,
    newline-separated string with one 'Name: value' line per metric.

    :param stats: A dict mapping metric keys to values; the metrics listed in
                  _pvalue_label hold (statistic, p-value) pairs, the rest are
                  printed as plain values.
    :return: The formatted multi-line string.
    '''
    formatted = {}
    # .items() runs on both Python 2 and 3; iteritems() is Python-2-only.
    for k, v in stats.items():
        name = keymap.get(k, k)
        if k in _pvalue_label:
            formatted[name] = '%0.3f (%s=%s)' % (v[0], _pvalue_label[k], str(v[1]))
        else:
            formatted[name] = v
    return '\n'.join('%s: %s' % (str(name).ljust(32), str(value)) for name, value in sorted(formatted.items()))


def plot(analysis_table, output_filename, RFunction):
    '''
    Render a scatterplot of the analysis table by writing it to a temporary
    CSV file and invoking the supplied R plotting function on that file.

    :param analysis_table: A list of dicts with Experimental/Predicted/ID entries.
    :param output_filename: Target image path; extension must be .png, .pdf or .eps.
    :param RFunction: A callable taking (input CSV path, output path, filetype e.g. 'png').
    :return: output_filename.
    :raises Exception: if the table has fewer than two points, or whatever
                       RFunction raises on failure.
    '''
    filetype = os.path.splitext(output_filename)[1].lower()
    assert(filetype in ('.png', '.pdf', '.eps'))
    filetype = filetype[1:]  # drop the leading dot; RFunction expects e.g. 'png'
    if len(analysis_table) <= 1:
        raise Exception("The analysis table must have at least two points.")
    input_filename = create_csv(analysis_table)
    try:
        RFunction(input_filename, output_filename, filetype)
    except Exception:
        # Surface the full R-side failure, then propagate. A bare 'raise'
        # preserves the original exception type and traceback, which the
        # original 'raise Exception(e)' destroyed; callers catching Exception
        # are unaffected.
        print(traceback.format_exc())
        raise
    finally:
        # The temporary CSV is removed on success and on failure alike,
        # replacing the duplicated delete_file() calls on each path.
        delete_file(input_filename)
    return output_filename


if __name__ == '__main__':
import random

Expand Down
2 changes: 1 addition & 1 deletion output/sample/curatedPT_r57471.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Experimental,Predicted,ID,RecordID
#Experimental,Predicted,ID,RecordID
3.000000,2.233660,71686,255
0.600000,1.491790,71687,257
3.100000,1.216640,71688,259
Expand Down
2 changes: 1 addition & 1 deletion output/sample/kellogg_r57471.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Experimental,Predicted,ID,RecordID
#Experimental,Predicted,ID,RecordID
0.800000,2.147640,71689,332
2.600000,3.888480,71692,333
1.200000,2.757570,71694,331
Expand Down

0 comments on commit 24cade5

Please sign in to comment.