Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IO optimization #164

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions astrodendro/io/fits.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ def dendro_import_fits(filename):
params = {"min_npix": hdus[0].header['MIN_NPIX'],
"min_value": hdus[0].header['MIN_VAL'],
"min_delta": hdus[0].header['MIN_DELT']}

else:

params = {}

return parse_dendrogram(newick, data, index_map, params, wcs)
Expand Down
122 changes: 82 additions & 40 deletions astrodendro/io/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,58 @@
from astropy import log


def parse_newick(string):
def newick_from_json(d):
"""
If "d" is a JSON-derived dict (e.g., `json.load('my_newick.json')`), this
will convert all of the keys from strings to integers, resulting in
something with identical format to `parse_newick`'s result.

Example:
>>> newick = ''.join(chr(x) for x in hdus[3].data.flat)
>>> rslt = astrodendro.io.util.parse_newick(newick)
>>> jj = json.dumps(rslt)
>>> rdtrip = newick_from_json(json.loads(jj))
>>> rdtrip == rslt
True
"""
new = {}
for k, v in d.items():
new_v = v
if isinstance(v, dict):
new_v = newick_from_json(v)
elif isinstance(v, list):
new_v = list()
for x in v:
if isinstance(x, dict):
new_v.append(newick_from_json(x))
else:
new_v.append(x)
new_v = tuple(new_v)
new[int(k)] = new_v
return new

def parse_newick(string, verbose=True):
items = {}

# Find maximum level
current_level = 0
max_level = 0
log.debug('String loading...')
log.debug("String starts with {0}".format(string[:100]))
log.debug("String loading... newick has {0} chars, {1} ('s"
.format(len(string), string.count("(")))
for i, c in enumerate(string):
if c == '(':
current_level += 1
if c == ')':
current_level -= 1
max_level = max(max_level, current_level)

if not verbose:
ProgressBar = lambda x: x

# Loop through levels and construct tree
log.debug('Tree loading...')
for level in range(max_level, 0, -1):
log.debug('Tree loading... max_level={0}'.format(max_level))
for level in ProgressBar(range(max_level, 0, -1)):

pairs = []

Expand Down Expand Up @@ -56,8 +90,6 @@ def parse_newick(string):
# Remove branch definition from string
string = string[:start] + string[end + 1:]

new_items = {}

def collect(d):
for item in d:
if item in items:
Expand All @@ -70,9 +102,44 @@ def collect(d):
return items['trunk']


def _construct_tree(dend, repr, indices_by_structure, flux_by_structure):
from ..structure import Structure
structures = []
for idx in repr:
idx = int(idx)
structure_indices = indices_by_structure[idx]
f = flux_by_structure[idx]
if type(repr[idx]) == tuple:
sub_structures_repr = repr[idx][0] # Parsed representation of sub structures
sub_structures = _construct_tree(dend,
sub_structures_repr,
indices_by_structure,
flux_by_structure)
for i in sub_structures:
dend._structures_dict[i.idx] = i
b = Structure(structure_indices, f, children=sub_structures, idx=idx, dendrogram=dend)
# Correct merge levels - complicated because of the
# order in which we are building the tree.
# What we do is look at the heights of this branch's
# 1st child as stored in the newick representation, and then
# work backwards to compute the merge level of this branch
#
# these five lines were not used
#first_child_repr = six.next(six.itervalues(sub_structures_repr))
#if type(first_child_repr) == tuple:
# height = first_child_repr[1]
#else:
# height = first_child_repr
dend._structures_dict[idx] = b
structures.append(b)
else:
ell = Structure(structure_indices, f, idx=idx, dendrogram=dend)
structures.append(ell)
dend._structures_dict[idx] = ell
return structures

def parse_dendrogram(newick, data, index_map, params, wcs=None):
from ..dendrogram import Dendrogram
from ..structure import Structure

d = Dendrogram()
d.ndim = len(data.shape)
Expand All @@ -88,38 +155,13 @@ def parse_dendrogram(newick, data, index_map, params, wcs=None):
except ImportError:
flux_by_structure, indices_by_structure = _slow_reader(d.index_map, data)

def _construct_tree(repr):
structures = []
for idx in repr:
idx = int(idx)
structure_indices = indices_by_structure[idx]
f = flux_by_structure[idx]
if type(repr[idx]) == tuple:
sub_structures_repr = repr[idx][0] # Parsed representation of sub structures
sub_structures = _construct_tree(sub_structures_repr)
for i in sub_structures:
d._structures_dict[i.idx] = i
b = Structure(structure_indices, f, children=sub_structures, idx=idx, dendrogram=d)
# Correct merge levels - complicated because of the
# order in which we are building the tree.
# What we do is look at the heights of this branch's
# 1st child as stored in the newick representation, and then
# work backwards to compute the merge level of this branch
first_child_repr = six.next(six.itervalues(sub_structures_repr))
if type(first_child_repr) == tuple:
height = first_child_repr[1]
else:
height = first_child_repr
d._structures_dict[idx] = b
structures.append(b)
else:
l = Structure(structure_indices, f, idx=idx, dendrogram=d)
structures.append(l)
d._structures_dict[idx] = l
return structures

# newline to avoid overwriting progressbar
print()
e-koch marked this conversation as resolved.
Show resolved Hide resolved
log.debug('Parsing newick and constructing tree...')
d.trunk = _construct_tree(parse_newick(newick))
parsed_newick = parse_newick(newick)

log.debug('Constructing tree...')
d.trunk = _construct_tree(d, parsed_newick, indices_by_structure, flux_by_structure)
# To make the structure.level property fast, we ensure all the items in the
# trunk have their level cached as "0"
for structure in d.trunk:
Expand Down Expand Up @@ -157,10 +199,10 @@ def _fast_reader(index_map, data):
match = index_map[sl] == idx
sl2 = (slice(None),) + sl
match_inds = index_cube[sl2][:, match]
coords = list(zip(*match_inds))
#coords = list(zip(*match_inds))
dd = data[sl][match].tolist()
flux_by_structure[idx] = dd
indices_by_structure[idx] = coords
indices_by_structure[idx] = match_inds.T
p.update()

return flux_by_structure, indices_by_structure
Expand Down
16 changes: 12 additions & 4 deletions astrodendro/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(self, indices, values, children=[], idx=None, dendrogram=None):
self._indices = [indices]
self._values = [values]
self._vmin, self._vmax = values, values
elif isinstance(indices, list) and isinstance(values, list):
elif (isinstance(indices, list) or isinstance(indices, np.ndarray)) and isinstance(values, list):
self._indices = indices
self._values = values
self._vmin, self._vmax = min(values), max(values)
Expand All @@ -78,7 +78,10 @@ def __init__(self, indices, values, children=[], idx=None, dendrogram=None):
self._values = [x for x in values]
self._vmin, self._vmax = min(values), max(values)

self._smallest_index = min(self._indices)
try:
self._smallest_index = min(self._indices)
except ValueError:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What causes the ValueError in min? Is there a check somewhere if _indices is empty?

self._smallest_index = self._indices[0]

self.idx = idx

Expand Down Expand Up @@ -400,10 +403,15 @@ def get_peak(self, subtree=True):
def key(x):
return x[1]

def to_tuple(x):
if np.isscalar(x):
return (x,)
else:
return tuple(x)

if self._peak is None:
for s in reversed(list(prefix_visit(self))):
s._peak = (s._indices[s._values.index(s.vmax)],
s.vmax)
s._peak = (to_tuple(s._indices[s._values.index(s.vmax)]), s.vmax)
if s.is_leaf:
s._peak_subtree = s._peak
else:
Expand Down
7 changes: 6 additions & 1 deletion astrodendro/tests/test_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ def strip_parentheses(string):
@pytest.mark.parametrize(('input_quantities', 'output_unit', 'keywords', 'output'), COMBINATIONS)
def test_compute_flux(input_quantities, output_unit, keywords, output):
q = compute_flux(input_quantities, output_unit, **keywords)
np.testing.assert_allclose(q.value, output.value)
# tolerances are set b/c some conversions lose precision at the ~10^-7 level
# (<Quantity [1., 2., 3.] K>, Unit("Jy"), {'spatial_scale': <Quantity 2. arcsec>, 'beam_major': <Quantity 1. arcsec>, 'beam_minor': <Quantity 0.5 arcsec>, 'wavelength': <Quantity 2. mm>}, <Quantity 0.38941637 Jy>)
# ->
# Max absolute difference: 5.64106332e-08
# Max relative difference: 1.44859431e-07
np.testing.assert_allclose(q.value, output.value, atol=1e-7, rtol=2e-7)
assert q.unit == output.unit


Expand Down
6 changes: 3 additions & 3 deletions astrodendro/tests/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,21 +331,21 @@ def test_add_pixel():
s = Structure(1, 10.)

assert s.get_npix() == 1
assert s.get_peak() == (1, 10)
assert s.get_peak() == ((1,), 10.)
assert s.vmin == 10.
assert s.vmax == 10.

s._add_pixel(2, 8.)

assert s.get_npix() == 2
assert s.get_peak() == (1, 10)
assert s.get_peak() == ((1,), 10.)
assert s.vmin == 8.
assert s.vmax == 10.

s._add_pixel(3, 12.)

assert s.get_npix() == 3
assert s.get_peak() == (3, 12.)
assert s.get_peak() == ((3,), 12.)
assert s.vmin == 8.
assert s.vmax == 12.

Expand Down