Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Commit

Permalink
Merge pull request #596 from ElDeveloper/collated_alpha_map
Browse files Browse the repository at this point in the history
Collated alpha map
  • Loading branch information
gregcaporaso committed Dec 23, 2012
2 parents aba9815 + fd542e9 commit baa350d
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 16 deletions.
52 changes: 51 additions & 1 deletion qiime/add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@


from copy import deepcopy
from numpy import searchsorted
from qiime.stats import quantile
from numpy import searchsorted, array
from qiime.parse import parse_rarefaction

def add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
alpha_data, mapping_file_headers,
Expand Down Expand Up @@ -127,3 +128,52 @@ def _get_level(value, levels, prefix=None):
output = value_level

return output

def mean_alpha(alpha_dict, depth):
    """Mean collated alpha diversity data at a given depth

    Input:
    alpha_dict: dictionary where the values are the lines of a collated alpha
    diversity data file and the keys are the names of each of these files with
    no extension, this name is usually the metric used to compute the alpha
    diversity.
    depth: selected depth to mean the computed alpha diversity values for the
    alpha_dict data.

    Output:
    metrics: list of metric names i. e. the name of each collated alpha div file
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean of alpha diversity data at a given
    depth for the different metrics, each column is a different metric.

    Raises ValueError when the collated files do not all contain data for the
    same sample identifiers in the same order.
    """
    assert type(alpha_dict) == dict, "Input data must be a dictionary"
    assert depth >= 0 and type(depth) == int, ("The specified depth must be a "
        "positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() instead of iteritems() so the code runs on both Python 2 and 3
    for key, value in alpha_dict.items():
        metrics.append('{0}_even_{1}'.format(key, depth))
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # check all the files have the same sample ids in the same order;
        # the first three header fields are not sample identifiers
        if sample_ids:
            if not sample_ids == identifiers[3:]:
                # bug fix: the original did `raise (ValueError, "...")`, which
                # raises a tuple and silently drops the explanatory message
                raise ValueError("Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # find all the data at the desired depth and get the mean values,
        # remove the first two elements ([depth, iteration]) as those are
        # not needed
        data.append(array([row[2:] for row in rarefaction_data if
            row[0] == depth]).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
66 changes: 52 additions & 14 deletions scripts/add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
__email__ = "[email protected]"
__status__ = "Development"

from os.path import basename, splitext
from qiime.format import format_mapping_file
from qiime.parse import parse_matrix, parse_mapping_file
from qiime.util import parse_command_line_parameters, make_option
from qiime.add_alpha_to_mapping_file import add_alpha_diversity_values_to_mapping_file
from qiime.add_alpha_to_mapping_file import (mean_alpha,
add_alpha_diversity_values_to_mapping_file)

script_info = {}
script_info['brief_description'] = "Add alpha diversity data to a metadata "+\
Expand All @@ -36,15 +38,23 @@
" classify the normalized values using the quartiles of the distribution "
"of this values.", "%prog -i adiv_pd.txt -m mapping.txt -b 4 -o alpha_"
"mapping_quantile.txt --binning_method=quantile"))
script_info['script_usage'].append(("Adding collated alpha diversity data",
"Add the mean of the alpha diversity values at a specified rarefaction"
" depth, this case is for use with the output of collated_alpha.py. It is "
"recommended that the filenames are the name of the metric used in each "
"file.", "%prog -i 'shannon.txt,chao1.txt' -m mapping.txt -b 4 -o collated_"
"alpha_mapping.txt --depth=49"))
script_info['output_description']= "The result of running this script is a "+\
"metadata mapping file that will include 3 new columns per alpha "+\
"diversity metric included in the alpha diversity file. For example, with"+\
" an alpha diversity file with only PD_whole_tree, the new columns will "+\
"PD_whole_tree_alpha, PD_whole_tree_normalized and PD_whole_tree_bin."
script_info['required_options'] = [
make_option('-i','--alpha_fp',type="existing_filepath",
make_option('-i','--alpha_fps',type="existing_filepaths",
help='alpha diversity data with one or multiple metrics i. e. the output'
' of alpha_diversity.py'),
' of alpha_diversity.py. This can also be a comma-separated list of colla'
'ted alpha diversity file paths i. e. the output of collate_alpha.py, when'
' using collated alpha diversity data the --depth option is required'),
make_option('-m','--mapping_fp',type="existing_filepath",
help='mapping file to modify by adding the alpha diversity data'),
]
Expand All @@ -53,20 +63,24 @@
help='filepath for the modified mapping file [default: %default]',
default='mapping_file_with_alpha.txt'),
make_option('-b','--number_of_bins',type="int",
help='number of bins [default: %default]', default=4),
help='number of bins [default: %default].', default=4),
make_option('-x','--missing_value_name',type="string",
help='bin prefix name for the sample identifiers that exist in the '
'mapping file (mapping_fp) but not in the alpha diversity file '
'(alpha_fp) [default: %default]', default='N/A'),
'(alpha_fp) [default: %default].', default='N/A'),
make_option('--binning_method', type='string', default='equal',
help='Select the method name to create the bins, the options are '
'\'equal\' and \'quantile\'. Both methods work over the normalized '
'alpha diversity values. On the one hand \'equal\' will assign the '
'bins on equally spaced limits, depending on the value of --number'
'_of_bins i. e. if you select 4 the limits will be [0.25, 0.50, 0.75]'
'. On the other hand \'quantile\' will select the limits based on the'
' --number_of_bins i. e. the limits will be the quartiles if 4 is '
'selected [default: %default].')
'\'equal\' and \'quantile\'. Both methods work over the normalized alpha '
'diversity values. On the one hand \'equal\' will assign the bins on '
'equally spaced limits, depending on the value of --number_of_bins i. e. '
'if you select 4 the limits will be [0.25, 0.50, 0.75]. On the other hand '
'\'quantile\' will select the limits based on the --number_of_bins i. e. '
'the limits will be the quartiles if 4 is selected [default: %default].'),
make_option('--depth', type='int', default=None, help='Select the rarefaction '
'depth to use when the alpha_fps refers to collated alpha diversity file(s)'
' i. e. the output of collate_alpha.py. All the iterations contained at '
'this depth will be averaged to form a single mean value [default: %defaul'
't].')
]
script_info['version'] = __version__

Expand All @@ -75,11 +89,12 @@
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)

alpha_fp = opts.alpha_fp
alpha_fps = opts.alpha_fps
mapping_fp = opts.mapping_fp
output_mapping_fp = opts.output_mapping_fp
binning_method = opts.binning_method
missing_value_name = opts.missing_value_name
depth = opts.depth

# make sure the number of bins is an integer
try:
Expand All @@ -88,10 +103,33 @@ def main():
raise ValueError, 'The number of bins must be an integer, not %s'\
% opts.number_of_bins

# if using collated data, make sure they specify a depth
if depth is not None:
alpha_dict = {}

# build up a dictionary with the filenames as keys and lines as values
for single_alpha_fp in alpha_fps:
alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
single_alpha_fp, 'U').readlines()

# format the collated data
metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict,
depth)

# when not using collated data, the user can only specify one input file
else:
if len(alpha_fps) > 1:
option_parser.error('A comma-separated list of files should only be'
' passed with the --alpha_fps option when using collated alpha '
'diversity data and also selecting a rarefaction depth with the'
' --depth option.')
else:
metrics, alpha_sample_ids, alpha_data = parse_matrix(open(
alpha_fps[0], 'U'))

# parse the data from the files
mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
open(mapping_fp, 'U'))
metrics, alpha_sample_ids, alpha_data = parse_matrix(open(alpha_fp, 'U'))

# add the alpha diversity data to the mapping file
out_mapping_file_data, out_mapping_file_headers = \
Expand Down
76 changes: 75 additions & 1 deletion tests/test_add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from numpy import array, median
from cogent.util.unit_test import TestCase, main
from qiime.add_alpha_to_mapping_file import (add_alpha_diversity_values_to_mapping_file,
_get_level)
_get_level, mean_alpha)

class TopLevelTests(TestCase):

Expand All @@ -26,6 +26,9 @@ def setUp(self):
self.sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
'PC.593', 'PC.355', 'PC.607', 'PC.634']

self.collated_alpha_dict_a = COLLATED_ALPHA_DICT_A
self.collated_alpha_dict_b = COLLATED_ALPHA_DICT_B

self.mapping_file_data = MAPPING_FILE_DATA
self.mapping_file_headers = ['SampleID', 'BarcodeSequence',
'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']
Expand Down Expand Up @@ -100,6 +103,44 @@ def test__get_level(self):
with self.assertRaises(AssertionError):
output = _get_level(-1, [0.25, 0.5, 0.75])

def test_mean_alpha(self):
    """checks data is being correctly averaged"""
    known_sample_ids = ['s1', 's2', 's3']

    # regular use-cases: two depths over the same well-formed collated data
    cases = [
        (310,
         ['PD_whole_tree_even_310', 'chao1_even_310'],
         [[9.441785, 82.93], [0.42877, 5.2006], [9.625995, 8.18]]),
        (610,
         ['PD_whole_tree_even_610', 'chao1_even_610'],
         [[12.508435, 11.6105], [0.42877, 8.42], [11.58785, 1.0]]),
    ]
    for depth, exp_metrics, exp_data in cases:
        obs_metrics, obs_sample_ids, obs_data = mean_alpha(
            self.collated_alpha_dict_a, depth)
        self.assertEquals(obs_metrics, exp_metrics)
        self.assertEquals(obs_sample_ids, known_sample_ids)
        self.assertEquals(obs_data, exp_data)

    # files with non-matching sample ids should raise an exception
    with self.assertRaises(ValueError):
        mean_alpha(self.collated_alpha_dict_b, 310)

    # input types that should not be processed
    with self.assertRaises(AssertionError):
        mean_alpha([1, 2, 3], 5)

    with self.assertRaises(AssertionError):
        mean_alpha({'a': 'b'}, -1.4)


MAPPING_FILE_DATA = [
['PC.354','AGCACGAGCCTA','YATGCTGCCTCCCGTAGGAGT','Control','20061218','Control_mouse_I.D._354'],
['PC.355','AACTCGTCGATG','YATGCTGCCTCCCGTAGGAGT','Control','20061218','Control_mouse_I.D._355'],
Expand Down Expand Up @@ -133,6 +174,39 @@ def test__get_level(self):
['PC.635', 'ACCGCAGAGTCA', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', 'Fasting_mouse_I.D._635', '332.5', '1.0', 'bin_4_of_4', '7.48089', '1.0', 'bin_4_of_4'],
['PC.636', 'ACGGTGAGTGTC', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', 'Fasting_mouse_I.D._636', '173.0', '0.342268041237', 'bin_2_of_4', '6.39901', '0.636003943167', 'bin_3_of_4']]

# Collated alpha diversity fixture: each key is a metric name (the collated
# file's basename) and each value is that file's lines. Columns after the
# first three (label, sequences per sample, iteration) hold the per-sample
# values for s1, s2 and s3; both metrics share the same sample ids.
COLLATED_ALPHA_DICT_A = {
'PD_whole_tree':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642'],
'chao1':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t4.2\t3.1415\t9.11',
'rare10.txt\t10\t1\t5.6\t3.15\t9.62',
'rare310.txt\t310\t0\t83.11\t5.2012\t8.12',
'rare310.txt\t310\t1\t82.75\t5.2000\t8.24',
'rare610.txt\t610\t0\t11.11\t8.42\t1',
'rare610.txt\t610\t1\t12.111\t8.42\t1']
}

# Like COLLATED_ALPHA_DICT_A, but the 'chao1' header deliberately lists
# different sample ids (s511, s512) than 'PD_whole_tree' (s1, s2) so tests can
# exercise the non-matching-sample-ids error path of mean_alpha.
COLLATED_ALPHA_DICT_B = {
'PD_whole_tree':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642'],
'chao1':['\tsequences per sample\titeration\ts511\ts512\ts3',
'rare10.txt\t10\t0\t4.2\t3.1415\t9.11',
'rare10.txt\t10\t1\t5.6\t3.15\t9.62',
'rare310.txt\t310\t0\t83.11\t5.2012\t8.12',
'rare310.txt\t310\t1\t82.75\t5.2000\t8.24',
'rare610.txt\t610\t0\t11.11\t8.42\t1',
'rare610.txt\t610\t1\t12.111\t8.42\t1']
}

if __name__ == "__main__":
main()

0 comments on commit baa350d

Please sign in to comment.