From d11c7ad47d2082e8f861d178da3c9cc1ab65e6f2 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 23 Apr 2019 12:12:24 -0600 Subject: [PATCH 1/7] @GiantMolecularCloud was having some problems with io times, so we implented some small speedups --- astrodendro/io/util.py | 69 +++++++++++++++++++++------------------- astrodendro/structure.py | 10 +++--- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index 93ee349..d67029f 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -70,6 +70,39 @@ def collect(d): return items['trunk'] +def _construct_tree(dend, repr, indices_by_structure, flux_by_structure): + from ..structure import Structure + structures = [] + for idx in repr: + idx = int(idx) + structure_indices = indices_by_structure[idx] + f = flux_by_structure[idx] + if type(repr[idx]) == tuple: + sub_structures_repr = repr[idx][0] # Parsed representation of sub structures + sub_structures = _construct_tree(sub_structures_repr) + for i in sub_structures: + dend._structures_dict[i.idx] = i + b = Structure(structure_indices, f, children=sub_structures, idx=idx, dendrogram=dend) + # Correct merge levels - complicated because of the + # order in which we are building the tree. + # What we do is look at the heights of this branch's + # 1st child as stored in the newick representation, and then + # work backwards to compute the merge level of this branch + # + # these five lines were not used + #first_child_repr = six.next(six.itervalues(sub_structures_repr)) + #if type(first_child_repr) == tuple: + # height = first_child_repr[1] + #else: + # height = first_child_repr + dend._structures_dict[idx] = b + structures.append(b) + else: + ell = Structure(structure_indices, f, idx=idx, dendrogram=dend) + structures.append(ell) + dend._structures_dict[idx] = ell + return structures + def parse_dendrogram(newick, data, index_map, params, wcs=None): from ..dendrogram import Dendrogram from ..structure import Structure @@ -88,38 +121,8 @@ def parse_dendrogram(newick, data, index_map, params, wcs=None): except ImportError: flux_by_structure, indices_by_structure = _slow_reader(d.index_map, data) - def _construct_tree(repr): - structures = [] - for idx in repr: - idx = int(idx) - structure_indices = indices_by_structure[idx] - f = flux_by_structure[idx] - if type(repr[idx]) == tuple: - sub_structures_repr = repr[idx][0] # Parsed representation of sub structures - sub_structures = _construct_tree(sub_structures_repr) - for i in sub_structures: - d._structures_dict[i.idx] = i - b = Structure(structure_indices, f, children=sub_structures, idx=idx, dendrogram=d) - # Correct merge levels - complicated because of the - # order in which we are building the tree. - # What we do is look at the heights of this branch's - # 1st child as stored in the newick representation, and then - # work backwards to compute the merge level of this branch - first_child_repr = six.next(six.itervalues(sub_structures_repr)) - if type(first_child_repr) == tuple: - height = first_child_repr[1] - else: - height = first_child_repr - d._structures_dict[idx] = b - structures.append(b) - else: - l = Structure(structure_indices, f, idx=idx, dendrogram=d) - structures.append(l) - d._structures_dict[idx] = l - return structures - log.debug('Parsing newick and constructing tree...') - d.trunk = _construct_tree(parse_newick(newick)) + d.trunk = _construct_tree(d, parse_newick(newick), indices_by_structure, flux_by_structure) # To make the structure.level property fast, we ensure all the items in the # trunk have their level cached as "0" for structure in d.trunk: @@ -157,10 +160,10 @@ def _fast_reader(index_map, data): match = index_map[sl] == idx sl2 = (slice(None),) + sl match_inds = index_cube[sl2][:, match] - coords = list(zip(*match_inds)) + #coords = list(zip(*match_inds)) dd = data[sl][match].tolist() flux_by_structure[idx] = dd - indices_by_structure[idx] = coords + indices_by_structure[idx] = match_inds.T p.update() return flux_by_structure, indices_by_structure diff --git a/astrodendro/structure.py b/astrodendro/structure.py index 0eb7d30..81254a5 100644 --- a/astrodendro/structure.py +++ b/astrodendro/structure.py @@ -69,7 +69,7 @@ def __init__(self, indices, values, children=[], idx=None, dendrogram=None): self._indices = [indices] self._values = [values] self._vmin, self._vmax = values, values - elif isinstance(indices, list) and isinstance(values, list): + elif (isinstance(indices, list) or isinstance(indices, np.ndarray)) and isinstance(values, list): self._indices = indices self._values = values self._vmin, self._vmax = min(values), max(values) @@ -78,7 +78,10 @@ def __init__(self, indices, values, children=[], idx=None, dendrogram=None): self._values = [x for x in values] self._vmin, self._vmax = min(values), max(values) - self._smallest_index = min(self._indices) + try: + self._smallest_index = min(self._indices) + except ValueError: + self._smallest_index = self._indices[0] self.idx = idx @@ -402,8 +405,7 @@ def key(x): if self._peak is None: for s in reversed(list(prefix_visit(self))): - s._peak = (s._indices[s._values.index(s.vmax)], - s.vmax) + s._peak = (tuple(s._indices[s._values.index(s.vmax)]), s.vmax) if s.is_leaf: s._peak_subtree = s._peak else: From 8ad22ed5ae48c9c42934d08ec82082bac4e875a5 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 23 Apr 2019 13:20:43 -0600 Subject: [PATCH 2/7] add a progressbar for the actual bottleneck --- astrodendro/io/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index d67029f..e855bb1 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -22,7 +22,7 @@ def parse_newick(string): # Loop through levels and construct tree log.debug('Tree loading...') - for level in range(max_level, 0, -1): + for level in ProgressBar(range(max_level, 0, -1)): pairs = [] From f2e40f8a540bfcc990f32952c4f220ba892d0434 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 23 Apr 2019 14:20:54 -0600 Subject: [PATCH 3/7] add a JSON parser and move the parse_newick to a different location. We still inexplicably have an order-of-magnitude speed difference between calling `parse_newick` on a newick string and doing the _exact same thing_ within the io code... --- astrodendro/io/fits.py | 13 +++++++---- astrodendro/io/util.py | 51 +++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/astrodendro/io/fits.py b/astrodendro/io/fits.py index b8d0473..4a012c2 100644 --- a/astrodendro/io/fits.py +++ b/astrodendro/io/fits.py @@ -3,8 +3,9 @@ import os import numpy as np +from astropy import log -from .util import parse_dendrogram +from .util import parse_dendrogram, parse_newick from .handler import IOHandler # Import and export @@ -71,12 +72,16 @@ def dendro_import_fits(filename): params = {"min_npix": hdus[0].header['MIN_NPIX'], "min_value": hdus[0].header['MIN_VAL'], "min_delta": hdus[0].header['MIN_DELT']} - + else: - + params = {} - return parse_dendrogram(newick, data, index_map, params, wcs) + log.debug('Parsing newick and constructing tree...') + log.debug("newick is: {0}".format(newick[:100])) + parsed_newick = parse_newick(newick) + + return parse_dendrogram(parsed_newick, data, index_map, params, wcs) FITSHandler = IOHandler(identify=is_fits, diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index e855bb1..00b18de 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -5,14 +5,45 @@ from astropy import log -def parse_newick(string): +def newick_from_json(d): + """ + If "d" is a JSON-derived dict (e.g., `json.load('my_newick.json')`), this + will convert all of the keys from strings to integers, resulting in + something with identical format to `parse_newick`'s result. + + Example: + >>> newick = ''.join(chr(x) for x in hdus[3].data.flat) + >>> rslt = astrodendro.io.util.parse_newick(newick) + >>> jj = json.dumps(rslt) + >>> rdtrip = newick_from_json(json.loads(jj)) + >>> rdtrip == rslt + True + """ + new = {} + for k, v in d.items(): + new_v = v + if isinstance(v, dict): + new_v = newick_from_json(v) + elif isinstance(v, list): + new_v = list() + for x in v: + if isinstance(x, dict): + new_v.append(newick_from_json(x)) + else: + new_v.append(x) + new_v = tuple(new_v) + new[int(k)] = new_v + return new +def parse_newick(string): items = {} # Find maximum level current_level = 0 max_level = 0 - log.debug('String loading...') + log.debug("String starts with {0}".format(string[:100])) + log.debug("String loading... newick has {0} chars, {1} ('s" + .format(len(string), string.count("("))) for i, c in enumerate(string): if c == '(': current_level += 1 @@ -21,7 +52,7 @@ def parse_newick(string): max_level = max(max_level, current_level) # Loop through levels and construct tree - log.debug('Tree loading...') + log.debug('Tree loading... max_level={0}'.format(max_level)) for level in ProgressBar(range(max_level, 0, -1)): pairs = [] @@ -56,8 +87,6 @@ def parse_newick(string): # Remove branch definition from string string = string[:start] + string[end + 1:] - new_items = {} - def collect(d): for item in d: if item in items: @@ -79,7 +108,10 @@ def _construct_tree(dend, repr, indices_by_structure, flux_by_structure): f = flux_by_structure[idx] if type(repr[idx]) == tuple: sub_structures_repr = repr[idx][0] # Parsed representation of sub structures - sub_structures = _construct_tree(sub_structures_repr) + sub_structures = _construct_tree(dend, + sub_structures_repr, + indices_by_structure, + flux_by_structure) for i in sub_structures: dend._structures_dict[i.idx] = i b = Structure(structure_indices, f, children=sub_structures, idx=idx, dendrogram=dend) @@ -103,9 +135,8 @@ def _construct_tree(dend, repr, indices_by_structure, flux_by_structure): dend._structures_dict[idx] = ell return structures -def parse_dendrogram(newick, data, index_map, params, wcs=None): +def parse_dendrogram(parsed_newick, data, index_map, params, wcs=None): from ..dendrogram import Dendrogram - from ..structure import Structure d = Dendrogram() d.ndim = len(data.shape) @@ -121,8 +152,8 @@ def parse_dendrogram(newick, data, index_map, params, wcs=None): except ImportError: flux_by_structure, indices_by_structure = _slow_reader(d.index_map, data) - log.debug('Parsing newick and constructing tree...') - d.trunk = _construct_tree(d, parse_newick(newick), indices_by_structure, flux_by_structure) + log.debug('Constructing tree...') + d.trunk = _construct_tree(d, parsed_newick, indices_by_structure, flux_by_structure) # To make the structure.level property fast, we ensure all the items in the # trunk have their level cached as "0" for structure in d.trunk: From c7ba98cc19eec4886119a3899ca2a0150a109825 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 23 Apr 2019 14:25:27 -0600 Subject: [PATCH 4/7] move the newick parsing back to where it was --- astrodendro/io/fits.py | 9 ++------- astrodendro/io/util.py | 5 ++++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/astrodendro/io/fits.py b/astrodendro/io/fits.py index 4a012c2..ebbf3e6 100644 --- a/astrodendro/io/fits.py +++ b/astrodendro/io/fits.py @@ -3,9 +3,8 @@ import os import numpy as np -from astropy import log -from .util import parse_dendrogram, parse_newick +from .util import parse_dendrogram from .handler import IOHandler # Import and export @@ -77,11 +76,7 @@ def dendro_import_fits(filename): params = {} - log.debug('Parsing newick and constructing tree...') - log.debug("newick is: {0}".format(newick[:100])) - parsed_newick = parse_newick(newick) - - return parse_dendrogram(parsed_newick, data, index_map, params, wcs) + return parse_dendrogram(newick, data, index_map, params, wcs) FITSHandler = IOHandler(identify=is_fits, diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index 00b18de..d23c966 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -135,7 +135,7 @@ def _construct_tree(dend, repr, indices_by_structure, flux_by_structure): dend._structures_dict[idx] = ell return structures -def parse_dendrogram(parsed_newick, data, index_map, params, wcs=None): +def parse_dendrogram(newick, data, index_map, params, wcs=None): from ..dendrogram import Dendrogram d = Dendrogram() @@ -152,6 +152,9 @@ def parse_dendrogram(parsed_newick, data, index_map, params, wcs=None): except ImportError: flux_by_structure, indices_by_structure = _slow_reader(d.index_map, data) + log.debug('Parsing newick and constructing tree...') + parsed_newick = parse_newick(newick) + log.debug('Constructing tree...') d.trunk = _construct_tree(d, parsed_newick, indices_by_structure, flux_by_structure) # To make the structure.level property fast, we ensure all the items in the From 5510e4265859e942b9c1e0d322e315de748ec4c5 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 23 Apr 2019 14:28:38 -0600 Subject: [PATCH 5/7] add a print statement --- astrodendro/io/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index d23c966..e8b28c0 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -152,6 +152,8 @@ def parse_dendrogram(newick, data, index_map, params, wcs=None): except ImportError: flux_by_structure, indices_by_structure = _slow_reader(d.index_map, data) + # newline to avoid overwriting progressbar + print() log.debug('Parsing newick and constructing tree...') parsed_newick = parse_newick(newick) From 58f21df1b95a537629f2f7cb6665376ddd757cac Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 7 Dec 2021 18:03:34 -0500 Subject: [PATCH 6/7] fix tests --- astrodendro/structure.py | 8 +++++++- astrodendro/tests/test_flux.py | 7 ++++++- astrodendro/tests/test_structure.py | 6 +++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/astrodendro/structure.py b/astrodendro/structure.py index 81254a5..4e2c5f6 100644 --- a/astrodendro/structure.py +++ b/astrodendro/structure.py @@ -403,9 +403,15 @@ def get_peak(self, subtree=True): def key(x): return x[1] + def to_tuple(x): + if np.isscalar(x): + return (x,) + else: + return tuple(x) + if self._peak is None: for s in reversed(list(prefix_visit(self))): - s._peak = (tuple(s._indices[s._values.index(s.vmax)]), s.vmax) + s._peak = (to_tuple(s._indices[s._values.index(s.vmax)]), s.vmax) if s.is_leaf: s._peak_subtree = s._peak else: diff --git a/astrodendro/tests/test_flux.py b/astrodendro/tests/test_flux.py index 3a104b3..c34cf68 100644 --- a/astrodendro/tests/test_flux.py +++ b/astrodendro/tests/test_flux.py @@ -25,7 +25,12 @@ def strip_parentheses(string): @pytest.mark.parametrize(('input_quantities', 'output_unit', 'keywords', 'output'), COMBINATIONS) def test_compute_flux(input_quantities, output_unit, keywords, output): q = compute_flux(input_quantities, output_unit, **keywords) - np.testing.assert_allclose(q.value, output.value) + # tolerances are set b/c some conversions lose precision at the ~10^-7 level + # (, Unit("Jy"), {'spatial_scale': , 'beam_major': , 'beam_minor': , 'wavelength': }, ) + # -> + # Max absolute difference: 5.64106332e-08 + # Max relative difference: 1.44859431e-07 + np.testing.assert_allclose(q.value, output.value, atol=1e-7, rtol=2e-7) assert q.unit == output.unit diff --git a/astrodendro/tests/test_structure.py b/astrodendro/tests/test_structure.py index 8040750..825a3cd 100644 --- a/astrodendro/tests/test_structure.py +++ b/astrodendro/tests/test_structure.py @@ -331,21 +331,21 @@ def test_add_pixel(): s = Structure(1, 10.) assert s.get_npix() == 1 - assert s.get_peak() == (1, 10) + assert s.get_peak() == ((1,), 10.) assert s.vmin == 10. assert s.vmax == 10. s._add_pixel(2, 8.) assert s.get_npix() == 2 - assert s.get_peak() == (1, 10) + assert s.get_peak() == ((1,), 10.) assert s.vmin == 8. assert s.vmax == 10. s._add_pixel(3, 12.) assert s.get_npix() == 3 - assert s.get_peak() == (3, 12.) + assert s.get_peak() == ((3,), 12.) assert s.vmin == 8. assert s.vmax == 12. From cd4f56e47822c95afdab82fb15bee82a1ff11d39 Mon Sep 17 00:00:00 2001 From: "Adam Ginsburg (keflavich)" Date: Tue, 7 Dec 2021 18:06:07 -0500 Subject: [PATCH 7/7] make progressbar pseudo-optional (nothing is passing kwargs to parse_newick right now) --- astrodendro/io/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/astrodendro/io/util.py b/astrodendro/io/util.py index e8b28c0..822cd88 100644 --- a/astrodendro/io/util.py +++ b/astrodendro/io/util.py @@ -35,7 +35,7 @@ def newick_from_json(d): new[int(k)] = new_v return new -def parse_newick(string): +def parse_newick(string, verbose=True): items = {} # Find maximum level @@ -51,6 +51,9 @@ def parse_newick(string): current_level -= 1 max_level = max(max_level, current_level) + if not verbose: + ProgressBar = lambda x: x + # Loop through levels and construct tree log.debug('Tree loading... max_level={0}'.format(max_level)) for level in ProgressBar(range(max_level, 0, -1)):