Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Commit

Permalink
Merge pull request #596 from ElDeveloper/collated_alpha_map
Browse files Browse the repository at this point in the history
Collated alpha map
  • Loading branch information
gregcaporaso committed Dec 23, 2012
2 parents aba9815 + fd542e9 commit baa350d
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 16 deletions.
52 changes: 51 additions & 1 deletion qiime/add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@


from copy import deepcopy
from numpy import searchsorted
from qiime.stats import quantile
from numpy import searchsorted, array
from qiime.parse import parse_rarefaction

def add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
alpha_data, mapping_file_headers,
Expand Down Expand Up @@ -127,3 +128,52 @@ def _get_level(value, levels, prefix=None):
output = value_level

return output

def mean_alpha(alpha_dict, depth):
    """Mean collated alpha diversity data at a given depth

    Input:
    alpha_dict: dictionary where the values are the lines of a collated alpha
    diversity data file and the keys are the names of each of these files with
    no extension, this name is usually the metric used to compute the alpha
    diversity.
    depth: selected depth to mean the computed alpha diversity values for the
    alpha_dict data.

    Output:
    metrics: list of metric names i. e. the name of each collated alpha div file
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean of alpha diversity data at a given
    depth for the different metrics, each column is a different metric.

    Raises ValueError when the collated files do not all contain data for the
    same sample identifiers in the same order.
    """
    assert type(alpha_dict) == dict, "Input data must be a dictionary"
    assert depth >= 0 and type(depth) == int, ("The specified depth must be a "
        "positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() instead of iteritems() so the code runs on both Python 2 and 3
    for key, value in alpha_dict.items():
        metrics.append('{0}_even_{1}'.format(key, depth))
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # check all the files have the same sample ids in the same order;
        # the first three header fields are not sample identifiers
        if sample_ids:
            if not sample_ids == identifiers[3:]:
                # bug fix: the original did `raise (ValueError, "...")`, which
                # raises a tuple and silently drops the explanatory message
                raise ValueError("Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # find all the data at the desired depth and get the mean values,
        # remove the first two elements ([depth, iteration]) as those are
        # not needed
        data.append(array([row[2:] for row in rarefaction_data if
            row[0] == depth]).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
66 changes: 52 additions & 14 deletions scripts/add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
__email__ = "[email protected]"
__status__ = "Development"

from os.path import basename, splitext
from qiime.format import format_mapping_file
from qiime.parse import parse_matrix, parse_mapping_file
from qiime.util import parse_command_line_parameters, make_option
from qiime.add_alpha_to_mapping_file import add_alpha_diversity_values_to_mapping_file
from qiime.add_alpha_to_mapping_file import (mean_alpha,
add_alpha_diversity_values_to_mapping_file)

script_info = {}
script_info['brief_description'] = "Add alpha diversity data to a metadata "+\
Expand All @@ -36,15 +38,23 @@
" classify the normalized values using the quartiles of the distribution "
"of this values.", "%prog -i adiv_pd.txt -m mapping.txt -b 4 -o alpha_"
"mapping_quantile.txt --binning_method=quantile"))
script_info['script_usage'].append(("Adding collated alpha diversity data",
"Add the mean of the alpha diversity values at a specified rarefaction"
" depth, this case is for use with the output of collated_alpha.py. It is "
"recommended that the filenames are the name of the metric used in each "
"file.", "%prog -i 'shannon.txt,chao1.txt' -m mapping.txt -b 4 -o collated_"
"alpha_mapping.txt --depth=49"))
script_info['output_description']= "The result of running this script is a "+\
"metadata mapping file that will include 3 new columns per alpha "+\
"diversity metric included in the alpha diversity file. For example, with"+\
" an alpha diversity file with only PD_whole_tree, the new columns will "+\
"PD_whole_tree_alpha, PD_whole_tree_normalized and PD_whole_tree_bin."
script_info['required_options'] = [
make_option('-i','--alpha_fp',type="existing_filepath",
make_option('-i','--alpha_fps',type="existing_filepaths",
help='alpha diversity data with one or multiple metrics i. e. the output'
' of alpha_diversity.py'),
' of alpha_diversity.py. This can also be a comma-separated list of colla'
'ted alpha diversity file paths i. e. the output of collate_alpha.py, when'
' using collated alpha diversity data the --depth option is required'),
make_option('-m','--mapping_fp',type="existing_filepath",
help='mapping file to modify by adding the alpha diversity data'),
]
Expand All @@ -53,20 +63,24 @@
help='filepath for the modified mapping file [default: %default]',
default='mapping_file_with_alpha.txt'),
make_option('-b','--number_of_bins',type="int",
help='number of bins [default: %default]', default=4),
help='number of bins [default: %default].', default=4),
make_option('-x','--missing_value_name',type="string",
help='bin prefix name for the sample identifiers that exist in the '
'mapping file (mapping_fp) but not in the alpha diversity file '
'(alpha_fp) [default: %default]', default='N/A'),
'(alpha_fp) [default: %default].', default='N/A'),
make_option('--binning_method', type='string', default='equal',
help='Select the method name to create the bins, the options are '
'\'equal\' and \'quantile\'. Both methods work over the normalized '
'alpha diversity values. On the one hand \'equal\' will assign the '
'bins on equally spaced limits, depending on the value of --number'
'_of_bins i. e. if you select 4 the limits will be [0.25, 0.50, 0.75]'
'. On the other hand \'quantile\' will select the limits based on the'
' --number_of_bins i. e. the limits will be the quartiles if 4 is '
'selected [default: %default].')
'\'equal\' and \'quantile\'. Both methods work over the normalized alpha '
'diversity values. On the one hand \'equal\' will assign the bins on '
'equally spaced limits, depending on the value of --number_of_bins i. e. '
'if you select 4 the limits will be [0.25, 0.50, 0.75]. On the other hand '
'\'quantile\' will select the limits based on the --number_of_bins i. e. '
'the limits will be the quartiles if 4 is selected [default: %default].'),
make_option('--depth', type='int', default=None, help='Select the rarefaction '
'depth to use when the alpha_fps refers to collated alpha diversity file(s)'
' i. e. the output of collate_alpha.py. All the iterations contained at '
'this depth will be averaged to form a single mean value [default: %defaul'
't].')
]
script_info['version'] = __version__

Expand All @@ -75,11 +89,12 @@
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)

alpha_fp = opts.alpha_fp
alpha_fps = opts.alpha_fps
mapping_fp = opts.mapping_fp
output_mapping_fp = opts.output_mapping_fp
binning_method = opts.binning_method
missing_value_name = opts.missing_value_name
depth = opts.depth

# make sure the number of bins is an integer
try:
Expand All @@ -88,10 +103,33 @@ def main():
raise ValueError, 'The number of bins must be an integer, not %s'\
% opts.number_of_bins

# if using collated data, make sure they specify a depth
if depth is not None:
alpha_dict = {}

# build up a dictionary with the filenames as keys and lines as values
for single_alpha_fp in alpha_fps:
alpha_dict[splitext(basename(single_alpha_fp))[0]] = open(
single_alpha_fp, 'U').readlines()

# format the collated data
metrics, alpha_sample_ids, alpha_data = mean_alpha(alpha_dict,
depth)

# when not using collated data, the user can only specify one input file
else:
if len(alpha_fps) > 1:
option_parser.error('A comma-separated list of files should only be'
' passed with the --alpha_fps option when using collated alpha '
'diversity data and also selecting a rarefaction depth with the'
' --depth option.')
else:
metrics, alpha_sample_ids, alpha_data = parse_matrix(open(
alpha_fps[0], 'U'))

# parse the data from the files
mapping_file_data, mapping_file_headers, comments = parse_mapping_file(
open(mapping_fp, 'U'))
metrics, alpha_sample_ids, alpha_data = parse_matrix(open(alpha_fp, 'U'))

# add the alpha diversity data to the mapping file
out_mapping_file_data, out_mapping_file_headers = \
Expand Down
76 changes: 75 additions & 1 deletion tests/test_add_alpha_to_mapping_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from numpy import array, median
from cogent.util.unit_test import TestCase, main
from qiime.add_alpha_to_mapping_file import (add_alpha_diversity_values_to_mapping_file,
_get_level)
_get_level, mean_alpha)

class TopLevelTests(TestCase):

Expand All @@ -26,6 +26,9 @@ def setUp(self):
self.sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
'PC.593', 'PC.355', 'PC.607', 'PC.634']

self.collated_alpha_dict_a = COLLATED_ALPHA_DICT_A
self.collated_alpha_dict_b = COLLATED_ALPHA_DICT_B

self.mapping_file_data = MAPPING_FILE_DATA
self.mapping_file_headers = ['SampleID', 'BarcodeSequence',
'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']
Expand Down Expand Up @@ -100,6 +103,44 @@ def test__get_level(self):
with self.assertRaises(AssertionError):
output = _get_level(-1, [0.25, 0.5, 0.75])

def test_mean_alpha(self):
    """checks data is being correctly averaged"""
    known_sample_ids = ['s1', 's2', 's3']

    # regular use-cases: two depths over the same well-formed collated data
    cases = [
        (310,
         ['PD_whole_tree_even_310', 'chao1_even_310'],
         [[9.441785, 82.93], [0.42877, 5.2006], [9.625995, 8.18]]),
        (610,
         ['PD_whole_tree_even_610', 'chao1_even_610'],
         [[12.508435, 11.6105], [0.42877, 8.42], [11.58785, 1.0]]),
    ]
    for depth, exp_metrics, exp_data in cases:
        obs_metrics, obs_sample_ids, obs_data = mean_alpha(
            self.collated_alpha_dict_a, depth)
        self.assertEquals(obs_metrics, exp_metrics)
        self.assertEquals(obs_sample_ids, known_sample_ids)
        self.assertEquals(obs_data, exp_data)

    # files with non-matching sample ids should raise an exception
    with self.assertRaises(ValueError):
        mean_alpha(self.collated_alpha_dict_b, 310)

    # input types that should not be processed
    with self.assertRaises(AssertionError):
        mean_alpha([1, 2, 3], 5)

    with self.assertRaises(AssertionError):
        mean_alpha({'a': 'b'}, -1.4)


MAPPING_FILE_DATA = [
['PC.354','AGCACGAGCCTA','YATGCTGCCTCCCGTAGGAGT','Control','20061218','Control_mouse_I.D._354'],
['PC.355','AACTCGTCGATG','YATGCTGCCTCCCGTAGGAGT','Control','20061218','Control_mouse_I.D._355'],
Expand Down Expand Up @@ -133,6 +174,39 @@ def test__get_level(self):
['PC.635', 'ACCGCAGAGTCA', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', 'Fasting_mouse_I.D._635', '332.5', '1.0', 'bin_4_of_4', '7.48089', '1.0', 'bin_4_of_4'],
['PC.636', 'ACGGTGAGTGTC', 'YATGCTGCCTCCCGTAGGAGT', 'Fast', '20080116', 'Fasting_mouse_I.D._636', '173.0', '0.342268041237', 'bin_2_of_4', '6.39901', '0.636003943167', 'bin_3_of_4']]

# Collated alpha diversity fixture: each key is a metric name (the collated
# file's basename) and each value is that file's lines. Columns after the
# first three (label, sequences per sample, iteration) hold the per-sample
# values for s1, s2 and s3; both metrics share the same sample ids.
COLLATED_ALPHA_DICT_A = {
'PD_whole_tree':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642'],
'chao1':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t4.2\t3.1415\t9.11',
'rare10.txt\t10\t1\t5.6\t3.15\t9.62',
'rare310.txt\t310\t0\t83.11\t5.2012\t8.12',
'rare310.txt\t310\t1\t82.75\t5.2000\t8.24',
'rare610.txt\t610\t0\t11.11\t8.42\t1',
'rare610.txt\t610\t1\t12.111\t8.42\t1']
}

# Like COLLATED_ALPHA_DICT_A, but the 'chao1' header deliberately lists
# different sample ids (s511, s512) than 'PD_whole_tree' (s1, s2) so tests can
# exercise the non-matching-sample-ids error path of mean_alpha.
COLLATED_ALPHA_DICT_B = {
'PD_whole_tree':['\tsequences per sample\titeration\ts1\ts2\ts3',
'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642'],
'chao1':['\tsequences per sample\titeration\ts511\ts512\ts3',
'rare10.txt\t10\t0\t4.2\t3.1415\t9.11',
'rare10.txt\t10\t1\t5.6\t3.15\t9.62',
'rare310.txt\t310\t0\t83.11\t5.2012\t8.12',
'rare310.txt\t310\t1\t82.75\t5.2000\t8.24',
'rare610.txt\t610\t0\t11.11\t8.42\t1',
'rare610.txt\t610\t1\t12.111\t8.42\t1']
}

if __name__ == "__main__":
main()

0 comments on commit baa350d

Please sign in to comment.