diff --git a/tests/test_variantdata.py b/tests/test_variantdata.py index 550ae1a5..746fc883 100644 --- a/tests/test_variantdata.py +++ b/tests/test_variantdata.py @@ -218,7 +218,11 @@ def test_variantdata_accessors_defaults(tmp_path, in_mem): ds = data if in_mem else sgkit.load_dataset(data) default_schema = tskit.MetadataSchema.permissive_json().schema - assert vdata.sequence_length == ts.sequence_length + with pytest.warns( + UserWarning, + match="`sequence_length` was not found as an attribute in the dataset", + ): + assert vdata.sequence_length == ts.sequence_length assert vdata.sites_metadata_schema == default_schema assert vdata.sites_metadata == [{} for _ in range(ts.num_sites)] for time in vdata.sites_time: @@ -234,17 +238,32 @@ def test_variantdata_accessors_defaults(tmp_path, in_mem): assert vdata.individuals_metadata == [ {"variant_data_sample_id": sample_id} for sample_id in ds.sample_id[:] ] - for time in vdata.individuals_time: - assert tskit.is_unknown_time(time) - assert np.array_equal( - vdata.individuals_location, np.array([[]] * ts.num_individuals, dtype=float) - ) - assert np.array_equal( - vdata.individuals_population, np.full(ts.num_individuals, tskit.NULL) - ) - assert np.array_equal( - vdata.individuals_flags, np.zeros(ts.num_individuals, dtype=int) - ) + with pytest.warns( + UserWarning, match="`individuals_time` was not found as an array in the dataset" + ): + for time in vdata.individuals_time: + assert tskit.is_unknown_time(time) + with pytest.warns( + UserWarning, + match="`individuals_location` was not found as an array in the dataset", + ): + assert np.array_equal( + vdata.individuals_location, np.array([[]] * ts.num_individuals, dtype=float) + ) + with pytest.warns( + UserWarning, + match="`individuals_population` was not found as an array in the dataset", + ): + assert np.array_equal( + vdata.individuals_population, np.full(ts.num_individuals, tskit.NULL) + ) + with pytest.warns( + UserWarning, + match="`individuals_flags` was not 
found as an array in the dataset", + ): + assert np.array_equal( + vdata.individuals_flags, np.zeros(ts.num_individuals, dtype=int) + ) @pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows") diff --git a/tsinfer/formats.py b/tsinfer/formats.py index c485d792..95c91061 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -2520,6 +2520,12 @@ def sequence_length(self): try: return self.data.attrs["sequence_length"] except KeyError: + warnings.warn( + "`sequence_length` was not found as an attribute in the dataset, so" + " the largest position has been used. It can be set with" + " ds.attrs['sequence_length'] = 1337; ds.to_zarr('path/to/store'," + " mode='a')" + ) return int(np.max(self.data["variant_position"])) + 1 @property @@ -2653,6 +2659,12 @@ def individuals_time(self): try: return self.data["individuals_time"][:][self.individuals_select] except KeyError: + warnings.warn( + "`individuals_time` was not found as an array in the dataset, so " + "tskit.UNKNOWN_TIME has been used. It can be appended to the dataset " + "with data_array.to_zarr('path/to/store', append_dim='samples', " + "mode='a')" + ) return np.full(self.num_individuals, tskit.UNKNOWN_TIME) @functools.cached_property @@ -2696,6 +2708,11 @@ def individuals_location(self): try: return self.data["individuals_location"][:][self.individuals_select] except KeyError: + warnings.warn( + "`individuals_location` was not found as an array in the dataset, " + "so [] has been used. It can be appended to the dataset with " + "data_array.to_zarr('path/to/store', append_dim='samples', mode='a')" + ) return np.array([[]] * self.num_individuals, dtype=float) @functools.cached_property @@ -2703,6 +2720,11 @@ def individuals_population(self): try: return self.data["individuals_population"][:][self.individuals_select] except KeyError: + warnings.warn( + "`individuals_population` was not found as an array in the dataset, " + "so tskit.NULL has been used. 
It can be appended to the dataset with " + "data_array.to_zarr('path/to/store', append_dim='samples', mode='a')" + ) return np.full((self.num_individuals), tskit.NULL, dtype=np.int32) @functools.cached_property @@ -2710,6 +2732,11 @@ def individuals_flags(self): try: return self.data["individuals_flags"][:][self.individuals_select] except KeyError: + warnings.warn( + "`individuals_flags` was not found as an array in the dataset, so 0 " + "has been used. It can be appended to the dataset with " + "data_array.to_zarr('path/to/store', append_dim='samples', mode='a')" + ) return np.full((self.num_individuals), 0, dtype=np.int32) @staticmethod