diff --git a/cosima_cookbook/database.py b/cosima_cookbook/database.py
index e61bf87..eca8833 100644
--- a/cosima_cookbook/database.py
+++ b/cosima_cookbook/database.py
@@ -172,7 +172,8 @@ def update_timeinfo(f, ncfile):
 
     with netCDF4.Dataset(f, 'r') as ds:
-        # we assume the record dimension corresponds to time
-        time_dim = netcdf_utils.find_record_dimension(ds)
+        # locate the time dimension heuristically (CF attributes first,
+        # then the record dimension, then a dimension named 'time')
+        time_dim = netcdf_utils.find_time_dimension(ds)
         if time_dim is None:
             return None
 
diff --git a/cosima_cookbook/netcdf_utils.py b/cosima_cookbook/netcdf_utils.py
index fcef180..5f83793 100644
--- a/cosima_cookbook/netcdf_utils.py
+++ b/cosima_cookbook/netcdf_utils.py
@@ -6,3 +6,57 @@ def find_record_dimension(d):
             return dim
 
     return None
+
+def find_dimension_with_attribute(d, attribute, value):
+    """Find a matching dimension with attribute=value, or None.
+
+    Only dimensions that have a variable of the same name in ``d`` are
+    considered; the attribute is read from that variable. The first match
+    in the iteration order of ``d.dimensions`` is returned.
+    """
+
+    for dim in d.dimensions:
+        if dim not in d.variables:
+            continue
+
+        if getattr(d.variables[dim], attribute, None) == value:
+            return dim
+
+    return None
+
+def find_time_dimension(d):
+    """Find a time dimension in a netCDF4 Dataset.
+
+    Checks are tried in order, and the first one that matches wins:
+
+    1. a dimension whose variable has ``standard_name == 'time'``
+    2. a dimension whose variable has ``axis == 'T'``
+    3. the dimension reported by ``find_record_dimension``
+    4. a dimension whose name is 'time', ignoring case
+
+    Returns the matching dimension, or None if no check matches.
+    """
+
+    # this is a bit heuristic, but we cascade through some checks, guided by
+    # the CF conventions
+
+    dim = find_dimension_with_attribute(d, 'standard_name', 'time')
+    if dim is not None:
+        return dim
+
+    dim = find_dimension_with_attribute(d, 'axis', 'T')
+    if dim is not None:
+        return dim
+
+    dim = find_record_dimension(d)
+    if dim is not None:
+        return dim
+
+    for dim in d.dimensions:
+        if dim.lower() == 'time':
+            return dim
+
+    # CF conventions also suggests the units attribute,
+    # but time_bounds may have the same units, and a false positive
+    # here could be very confusing...
+    return None
diff --git a/test/data/indexing/time/t1.nc b/test/data/indexing/time/t1.nc
new file mode 100644
index 0000000..9b8bb44
Binary files /dev/null and b/test/data/indexing/time/t1.nc differ
diff --git a/test/data/indexing/time/t2.nc b/test/data/indexing/time/t2.nc
new file mode 100644
index 0000000..27a6e82
Binary files /dev/null and b/test/data/indexing/time/t2.nc differ
diff --git a/test/data/indexing/time/t3.nc b/test/data/indexing/time/t3.nc
new file mode 100644
index 0000000..d6f8e4b
Binary files /dev/null and b/test/data/indexing/time/t3.nc differ
diff --git a/test/data/indexing/time/t4.nc b/test/data/indexing/time/t4.nc
new file mode 100644
index 0000000..e419292
Binary files /dev/null and b/test/data/indexing/time/t4.nc differ
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 87a6307..1611388 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -170,6 +170,20 @@ def test_broken_metadata(session_db):
 
     assert(indexed == 1)
 
+def test_time_dimension(session_db):
+    """Every file in the time fixture directory is indexed with a time range."""
+    session, db = session_db
+    database.build_index('test/data/indexing/time', session)
+
+    q = session.query(database.NCFile.time_start, database.NCFile.time_end)
+    assert(q.count() == 4) # should pick up 4 files
+
+    # Use .is_(None) so the NULL check is rendered as SQL. A Python `is None`
+    # on a column evaluates to False before the query is built, which would
+    # make this assertion pass regardless of what was indexed.
+    q = q.filter(database.NCFile.time_start.is_(None) | database.NCFile.time_end.is_(None))
+    assert(q.count() == 0) # but all of them should have times populated
+
 def test_distributed(client, session_db):
     session, db = session_db
     database.build_index('test/data/indexing/broken_file', session, client)