Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DATASET: Air pollution data #309

Merged
merged 4 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions hub/management/commands/import_air_quality_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from functools import reduce

from django.conf import settings
from django.core.management.base import BaseCommand

import pandas as pd
from tqdm import tqdm

from hub.models import Area, AreaData, DataSet, DataType


class Command(BaseCommand):
    """Import Defra modelled air-pollution data, averaged per constituency.

    The command downloads one CSV per pollutant (see ``in_files``), joins
    them on their shared ``gridcode`` column, maps every grid cell to a GSS
    code using ``gridcode_lookup_file`` and stores the mean of each pollutant
    per GSS code as ``AreaData`` rows.  The mean of each column across all
    imported areas is saved on the matching ``DataType`` as its ``average``.
    """

    help = "Import air-pollution data"

    source_url = "https://uk-air.defra.gov.uk/data/modelling-data"

    gridcode_lookup_file = (
        settings.BASE_DIR / "data" / "gridcode_constituency_lookup.csv"
    )
    message = "Importing constituency air-pollution data"
    # Every DataType name is this prefix followed by its key in `in_files`;
    # import_data() strips it again to find the dataframe column.
    data_type_prefix = "air_quality_"
    defaults = {
        "label": "Air pollution",
        "description": "Data relating to air pollution, recorded in 2021",
        "data_type": "float",
        "category": "place",
        "source_label": "Defra",
        "is_range": True,
        "source": source_url,
        "source_type": "csv",
        "table": "areadata",
        "comparators": DataSet.numerical_comparators(),
        "default_value": 10,
        "is_shadable": False,
    }

    # One entry per pollutant.  The key becomes the dataframe column name and
    # the DataType name suffix; `header_label` is the value column in the CSV
    # found at `csv_link`; the optional `comments` is appended to the metric
    # in the DataType description.
    in_files = {
        "pm_10": {
            "pollutant": "PM10",
            "metric": "Annual mean",
            "header_label": "pm102021g",
            "comments": "Gravimetric units",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        },
        "pm_2_5": {
            "pollutant": "PM2.5",
            "metric": "Annual mean",
            "header_label": "pm252021g",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        },
        "no_2": {
            "pollutant": "NO2",
            "metric": "Annual mean",
            "header_label": "no22021",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        },
        "no_x": {
            "pollutant": "NOx",
            "metric": "Annual mean",
            "header_label": "nox2021",
            "comments": "µg m\u207B\u00B3 (NO\u2093 as NO\u2082)",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mapnox2021.csv",
        },
        "so_2": {
            "pollutant": "SO2",
            "metric": "Annual mean",
            "header_label": "so22021",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        },
        "ozone": {
            "pollutant": "Ozone",
            "metric": "DGT120",
            "header_label": "dgt12021",
            "comments": "number of days on which the daily max 8-hr concentration is greater than 120 µg m\u207B\u00B3",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mapdgt12021.csv",
        },
        "benzene": {
            "pollutant": "Benzene",
            "metric": "Annual mean",
            "header_label": "bz2021",
            "csv_link": "https://uk-air.defra.gov.uk/datastore/pcm/mapbz2021.csv",
        },
    }

    def add_arguments(self, parser):
        """Register the ``-q`` / ``--quiet`` flag."""
        parser.add_argument(
            "-q", "--quiet", action="store_true", help="Silence progress bars."
        )

    def handle(self, quiet=False, *args, **options):
        """Run the import: fetch, (re)create types, clear old rows, insert."""
        self._quiet = quiet
        df = self.get_dataframe()
        self.data_types = self.create_data_types(df)
        self.delete_data()
        self.import_data(df)

    def create_data_types(self, df):
        """Create or update the DataSet and one DataType per dataframe column.

        Returns the DataType objects in the same order as ``df.columns``.
        """
        if not self._quiet:
            self.stdout.write("Creating dataset + types")
        data_set, created = DataSet.objects.update_or_create(
            name="constituency_air_quality", defaults=self.defaults
        )
        data_types = []
        for col in tqdm(df.columns, disable=self._quiet):
            metadata = self.in_files[col]
            description = metadata["metric"]
            if "comments" in metadata:
                description += f" ({metadata['comments']})"

            data_type, created = DataType.objects.update_or_create(
                data_set=data_set,
                name=f"{self.data_type_prefix}{col}",
                defaults={
                    "data_type": "float",
                    "label": metadata["pollutant"],
                    "description": description,
                },
            )
            data_types.append(data_type)

        return data_types

    def import_data(self, df):
        """Store one AreaData row per (area, pollutant) and the column means.

        ``df`` is indexed by GSS code with one column per ``in_files`` key.
        Rows whose GSS code matches no Area are reported and skipped.
        """
        if not self._quiet:
            self.stdout.write("Importing air quality data")

        # Pair each DataType with its dataframe column once, instead of
        # slicing the name with a magic offset on every row.
        prefix_length = len(self.data_type_prefix)
        typed_columns = [
            (data_type, data_type.name[prefix_length:])
            for data_type in self.data_types
        ]

        # iterrows() is a generator, so tqdm needs the length spelled out to
        # draw a real progress bar.
        rows = tqdm(df.iterrows(), total=len(df), disable=self._quiet)
        for gss, row in rows:
            try:
                area = Area.objects.get(gss=gss)
            except Area.DoesNotExist:
                self.stdout.write(f"Failed to find area with code {gss}")
                continue
            for data_type, column in typed_columns:
                value = row[column]
                # "MISSING" cells are read as NaN; if a whole area has no
                # valid reading for a pollutant its mean is NaN too, and
                # that is not a value worth storing.
                if pd.isna(value):
                    continue
                AreaData.objects.create(
                    data_type=data_type,
                    area=area,
                    data=value,
                )

        # Reuse the DataType objects created for this data set rather than
        # looking them up again by name alone.
        for data_type, column in typed_columns:
            data_type.average = df[column].mean()
            data_type.save()

    def delete_data(self):
        """Remove all existing AreaData rows for this command's data types."""
        AreaData.objects.filter(data_type__in=self.data_types).delete()

    def get_dataframe(self):
        """Download the pollutant CSVs and return per-GSS-code means.

        Returns a dataframe indexed by ``gss`` with one float column per
        ``in_files`` key.
        """
        if not self._quiet:
            self.stdout.write("Importing separate csvs")
        dfs = []
        for label, metadata in tqdm(self.in_files.items(), disable=self._quiet):
            value_column = metadata["header_label"]
            dfs.append(
                pd.read_csv(
                    metadata["csv_link"],
                    usecols=["gridcode", value_column],
                    skiprows=5,
                    na_values="MISSING",
                    dtype={"gridcode": "int", value_column: "float"},
                ).rename(columns={value_column: label})
            )

        if not self._quiet:
            self.stdout.write("Transforming data")
        # Merge all of the dataframes on the common 'gridcode' column
        df = reduce(lambda df1, df2: pd.merge(df1, df2, on="gridcode"), dfs)

        # Use external lookup file to append GSS codes
        gridcode_lookup = (
            pd.read_csv(self.gridcode_lookup_file, usecols=["gss", "gridcode"])
            .set_index("gridcode")["gss"]
            .to_dict()
        )
        # Grid codes absent from the lookup map to NaN
        df["gss"] = df["gridcode"].map(gridcode_lookup)

        # Drop missing values (which occur when a code lies outside a
        # constituency - in the sea)
        df = df.dropna(subset=["gss"])

        # Prepare the df for useful importing
        df = df.drop(columns=["gridcode"]).groupby("gss").mean()
        return df
18 changes: 18 additions & 0 deletions hub/migrations/0042_add_dataset_release_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.3 on 2023-07-06 11:38

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add an optional ``release_date`` date field to the ``dataset`` model."""

    dependencies = [("hub", "0041_alter_dataset_subcategory")]

    operations = [
        migrations.AddField(
            "dataset",
            "release_date",
            models.DateField(null=True, blank=True),
        ),
    ]
1 change: 1 addition & 0 deletions hub/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def options_default():
max_length=50, blank=True, null=True, choices=SOURCE_CHOICES
)
data_url = models.URLField(blank=True, null=True)
release_date = models.DateField(blank=True, null=True)
is_upload = models.BooleanField(default=False)
is_range = models.BooleanField(default=False)
featured = models.BooleanField(default=False)
Expand Down
24 changes: 24 additions & 0 deletions hub/templates/hub/area/_place_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,30 @@ <h5>{{ category.name }}</h5>
{% endfor %}
</body>
</table>
{% elif category.name == 'Air pollution' %}
    {# Values are concentrations or day counts (units are in each data type's description), not percentages, so no "%" suffix. #}
    <table class="table mb-0">
        <thead>
            <tr>
                <th scope="col">Pollutant</th>
                <th scope="col">This area</th>
                <th scope="col" class="text-muted">UK average</th>
            </tr>
        </thead>
        <tbody>
            {% for range in category.data %}
            <tr>
                <th>
                    {{ range.label|html_format_dataset_name|safe }}
                    {% if range.data_type.description %}
                    <small class="d-block mt-1 fs-8 fw-normal">{{ range.data_type.description|escape }}</small>
                    {% endif %}
                </th>
                <td>{{ range.value|floatformat }}</td>
                <td class="text-muted">{{ range.average|floatformat }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
{% else %}
{% if category.data.is_number %}
<p class="card-text mb-0 display-6 lh-1 text-primary">{{ category.data.value|floatformat }}{% if category.data.is_percentage %}%{% endif %}</p>
Expand Down
1 change: 1 addition & 0 deletions hub/templates/hub/explore.html
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ <h3 class="h6">${ filter.title }</h3>
<p class="text-muted fs-7 mb-0 mt-3">
<template v-if="filter.description">${ filter.description }.</template>
Data from <a href="${ filter.source }">${ filter.source_label }</a>.
<span v-if="filter.release_date">Last update: ${ filter.release_date }</span>
</p>
<table v-if="Number.isFinite(filter.min)" class="table table-sm table-bordered table-fixed text-center mb-0 mt-3 fs-7">
<thead>
Expand Down
13 changes: 13 additions & 0 deletions hub/templatetags/hub_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,16 @@ def pending_account_requests(**kwargs):
userproperties__email_confirmed=True,
userproperties__account_confirmed=False,
).count()


@register.filter
@stringfilter
def html_format_dataset_name(value):
    """Return HTML for a pollutant label, with subscripts where known.

    Known pollutant names are swapped for hand-written markup.  Any other
    value is HTML-escaped before being returned, so the result is safe to
    output even when the template marks it ``|safe``.
    """
    # Imported here so the filter carries its own dependencies.
    from django.utils.html import conditional_escape
    from django.utils.safestring import mark_safe

    pollutants = {
        "PM10": "PM<sub>10</sub>",
        "PM2.5": "PM<sub>2.5</sub>",
        "NO2": "NO<sub>2</sub>",
        "NOx": "NO<sub>X</sub>",
        "SO2": "SO<sub>2</sub>",
    }
    markup = pollutants.get(value)
    if markup is not None:
        # Trusted, literal markup defined just above.
        return mark_safe(markup)
    # Unknown labels come from stored data; never pass them through raw.
    return conditional_escape(value)
2 changes: 1 addition & 1 deletion hub/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ def test_dataset_field_availability(self):
context = response.context
form = context["adminform"]

self.assertEqual(len(form.fields), 11)
self.assertEqual(len(form.fields), 12)
3 changes: 3 additions & 0 deletions hub/views/explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ def render_to_response(self, context, **response_kwargs):
is_range=d.is_range,
data_type=d.data_type,
)
if d.release_date is not None:
ds["release_date"] = d.release_date

if (
type_map.get(d.name, None) is not None
and type_map[d.name]["min"] is not None
Expand Down