diff --git a/api/main.py b/api/main.py
index bd92762268d3..5e15ca72e849 100644
--- a/api/main.py
+++ b/api/main.py
@@ -7,8 +7,7 @@
 from api.v1 import v1
 from etl import config
 from etl.db import get_engine
-
-from . import utils
+from etl.slack_helpers import format_slack_message, send_slack_message
 
 log = structlog.get_logger()
 
@@ -66,11 +65,9 @@ async def slack_middleware(request: Request, call_next):
     log.info("response", method=request.method, url=str(request.url), status_code=response.status_code, body=res_body)
 
-    utils.send_slack_message(
+    send_slack_message(
         "#metadata-updates",
-        utils.format_slack_message(
-            request.method, request.url, response.status_code, req_body.decode(), res_body.decode()
-        ),
+        format_slack_message(request.method, request.url, response.status_code, req_body.decode(), res_body.decode()),
     )
 
     return Response(
diff --git a/api/utils.py b/api/utils.py
index 6c9a281861c9..fdf3092905df 100644
--- a/api/utils.py
+++ b/api/utils.py
@@ -1,43 +1,5 @@
-import json
 from typing import Any, Dict
 
-from slack_sdk import WebClient
-
-from etl import config
-
-slack_client = WebClient(token=config.SLACK_API_TOKEN)
-
 
 def prune_none(d: Dict[str, Any]) -> Dict[str, Any]:
     return {k: v for k, v in d.items() if v is not None}
-
-
-def send_slack_message(channel: str, message: str) -> None:
-    if config.SLACK_API_TOKEN:
-        slack_client.chat_postMessage(channel=channel, text=message)
-
-
-def format_slack_message(method, url, status_code, req_body, res_body):
-    try:
-        res_body = json.dumps(json.loads(res_body), indent=2)
-    except json.decoder.JSONDecodeError:
-        pass
-
-    try:
-        req_body = json.dumps(json.loads(req_body), indent=2)
-    except json.decoder.JSONDecodeError:
-        pass
-
-    if status_code == 200:
-        emoji = ":information_source:"
-    else:
-        emoji = ":warning:"
-
-    message = f"{emoji} *{method}* {url}\n"
-
-    if req_body:
-        message += f"Request\n```\n{req_body}\n```\n"
-
-    message += f"Response\n```\n{res_body}\n```\n"
-
-    return message
diff --git a/apps/chart_sync/cli.py b/apps/chart_sync/cli.py
index 26a1bf228071..163f8f7d9cb8 100644
--- a/apps/chart_sync/cli.py
+++ b/apps/chart_sync/cli.py
@@ -7,7 +7,6 @@
 import structlog
 from rich import print
 from rich_click.rich_command import RichCommand
-from slack_sdk import WebClient
 from sqlalchemy.orm import Session
 
 from apps.chart_sync.admin_api import AdminAPI
@@ -17,6 +16,7 @@
 from etl.config import OWIDEnv, get_container_name
 from etl.datadiff import _dict_diff
 from etl.grapher import model as gm
+from etl.slack_helpers import send_slack_message
 
 config.enable_bugsnag()
 
@@ -273,8 +273,7 @@ def _notify_slack_chart_update(chart_id: int, source: str, diff: ChartDiff, dry_
 
     if config.SLACK_API_TOKEN and not dry_run:
         assert diff.target_chart
-        slack_client = WebClient(token=config.SLACK_API_TOKEN)
-        slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
+        send_slack_message(channel="#data-architecture-github", message=message)
 
 
 def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool) -> None:
@@ -286,8 +285,7 @@ def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool)
     print(message)
 
     if config.SLACK_API_TOKEN and not dry_run:
-        slack_client = WebClient(token=config.SLACK_API_TOKEN)
-        slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
+        send_slack_message(channel="#data-architecture-github", message=message)
 
 
 def _matches_include_exclude(chart: gm.Chart, session: Session, include: Optional[str], exclude: Optional[str]):
diff --git a/apps/cli/__init__.py b/apps/cli/__init__.py
index 380079732517..ed459f65f337 100644
--- a/apps/cli/__init__.py
+++ b/apps/cli/__init__.py
@@ -74,6 +74,7 @@ def _lazy_load(self, cmd_name):
             "map-datasets": "apps.utils.map_datasets.cli",
             "scan-chart-diff": "apps.utils.scan_chart_diff.cli",
             "profile": "apps.utils.profile.cli",
+            "housekeeper": "apps.housekeeper.cli.main",
         },
     },
     "b": {
diff --git a/apps/housekeeper/__init__.py b/apps/housekeeper/__init__.py
new file mode 100644
index 000000000000..9a28c5525de3
--- /dev/null
+++ b/apps/housekeeper/__init__.py
@@ -0,0 +1,12 @@
+"""Assist with housekeeping tasks.
+
+The initial motivation for this was to help with the problem of chart maintenance:
+
+"A growing problem we have at OWID is that our database contains a very high number of charts, and this number keeps growing month by month. Many charts are good and worth keeping, but several hundred at least are charts that aren't maintained, updated, or generally up to our current standards.
+
+These charts get few views but still "clog" our internal admin and search results (on OWID and search engines). Overall, these charts take mental space that we could instead allocate to maintaining our most important charts."
+
+TODOs:
+
+Add the option of regular reviews of datasets, etc.
+"""
diff --git a/apps/housekeeper/charts.py b/apps/housekeeper/charts.py
new file mode 100644
index 000000000000..d7a9e9654cf9
--- /dev/null
+++ b/apps/housekeeper/charts.py
@@ -0,0 +1,64 @@
+from datetime import datetime
+
+import pandas as pd
+
+from apps.housekeeper.utils import add_reviews, get_reviews_id
+from apps.wizard.app_pages.similar_charts.data import get_raw_charts
+from etl.config import OWID_ENV
+from etl.slack_helpers import send_slack_message
+
+CHANNEL_NAME = "#lucas-playground"
+SLACK_USERNAME = "housekeeper"
+
+
+def get_charts_to_review():
+    df = get_raw_charts()
+
+    # Keep only charts older than a year
+    TODAY = datetime.today()
+    YEAR_AGO = TODAY.replace(year=TODAY.year - 1)
+    df = df.loc[df["created_at"] < YEAR_AGO]
+
+    # Discard charts already presented in the chat
+    reviews_id = get_reviews_id(object_type="chart")
+    df = df.loc[~df["chart_id"].isin(reviews_id)]
+
+    return df
+
+
+def select_chart(df: pd.DataFrame):
+    # Sort by views (ascending)
+    df = df.sort_values(["views_365d", "views_14d", "views_7d"])
+
+    # Select the least-viewed chart
+    chart = df.iloc[0]
+
+    return chart
+
+
+def send_slack_chart_review(channel_name: str, slack_username: str, icon_emoji: str):
+    # Get charts
+    df = get_charts_to_review()
+
+    # Select chart
+    chart = select_chart(df)
+
+    # Prepare message
+    DATE = datetime.today().date().strftime("%d %b, %Y")
+
+    message = (
+        f"{DATE}: *Daily chart to review is...*\n"
+        f"<{OWID_ENV.chart_site(chart['slug'])}|{chart['title']}> ({chart['views_365d']} views in the last year)\n"
+        f"Go to <{OWID_ENV.chart_admin_site(chart['chart_id'])}|edit :writing_hand:>\n"
+    )
+
+    # Send message
+    send_slack_message(
+        channel=channel_name,
+        message=message,
+        icon_emoji=icon_emoji,
+        username=slack_username,
+    )
+
+    # Add chart to reviewed charts
+    add_reviews(object_type="chart", object_id=chart["chart_id"])
diff --git a/apps/housekeeper/cli.py b/apps/housekeeper/cli.py
new file mode 100644
index 000000000000..f7a9e0953949
--- /dev/null
+++ b/apps/housekeeper/cli.py
@@ -0,0 +1,29 @@
+"""Keep things in the OWID catalog clean by regularly checking and reviewing content."""
+
+import click
+from rich_click import RichCommand
+
+from apps.housekeeper.charts import send_slack_chart_review
+
+# TODO: Add more review types
+REVIEW_TYPES = [
+    "chart",
+    # "dataset",
+]
+
+# Config
+CHANNEL_NAME = "#lucas-playground"
+SLACK_USERNAME = "housekeeper"
+ICON_EMOJI = "sus-blue"
+
+
+@click.command("housekeeper", cls=RichCommand, help=__doc__)
+@click.option("--review-type", "-t", type=click.Choice(REVIEW_TYPES, case_sensitive=False))
+def main(review_type: str):
+    # Review charts
+    if review_type == "chart":
+        send_slack_chart_review(
+            channel_name=CHANNEL_NAME,
+            slack_username=SLACK_USERNAME,
+            icon_emoji=ICON_EMOJI,
+        )
diff --git a/apps/housekeeper/utils.py b/apps/housekeeper/utils.py
new file mode 100644
index 000000000000..6fa23f70c580
--- /dev/null
+++ b/apps/housekeeper/utils.py
@@ -0,0 +1,18 @@
+from sqlalchemy.orm import Session
+
+from etl.config import OWID_ENV
+from etl.grapher import model as gm
+
+
+def get_reviews_id(object_type: str):
+    with Session(OWID_ENV.engine) as session:
+        return gm.HousekeepingSuggestedReview.load_reviews_object_id(session, object_type=object_type)
+
+
+def add_reviews(object_type: str, object_id: int):
+    with Session(OWID_ENV.engine) as session:
+        gm.HousekeepingSuggestedReview.add_review(
+            session=session,
+            object_type=object_type,
+            object_id=object_id,
+        )
diff --git a/apps/wizard/app_pages/indicator_search/app.py b/apps/wizard/app_pages/indicator_search/app.py
index 5cc95d9c38aa..a8e3ead8c5ae 100644
--- a/apps/wizard/app_pages/indicator_search/app.py
+++ b/apps/wizard/app_pages/indicator_search/app.py
@@ -121,7 +121,8 @@ def deduplicate_dimensions(indicators: list[data.Indicator]) -> list[data.Indica
 @st.cache_data(show_spinner=False, max_entries=1)
 def get_and_fit_model(_indicators: list[data.Indicator]) -> emb.EmbeddingsModel:
     # Get embedding model.
-    model = emb.EmbeddingsModel(emb.get_model())
+    with st.spinner("Loading model..."):
+        model = emb.EmbeddingsModel(emb.get_model())
     # Create an embedding for each indicator.
     with st.spinner("Creating embeddings..."):
         model.fit(_indicators)
diff --git a/apps/wizard/app_pages/insight_search/app.py b/apps/wizard/app_pages/insight_search/app.py
index eccbe53d7648..3d2f1af8f357 100644
--- a/apps/wizard/app_pages/insight_search/app.py
+++ b/apps/wizard/app_pages/insight_search/app.py
@@ -67,7 +67,8 @@ def get_authors_with_DIs(insights: list[data.Insight]) -> set[str]:
 @st.cache_data(show_spinner=False, max_entries=1)
 def get_and_fit_model(insights: list[data.Insight]) -> emb.EmbeddingsModel:
     # Get embedding model.
-    model = emb.EmbeddingsModel(emb.get_model())
+    with st.spinner("Loading model..."):
+        model = emb.EmbeddingsModel(emb.get_model())
     # Create an embedding for each insight.
     model.fit(insights)
     return model
diff --git a/apps/wizard/app_pages/similar_charts/app.py b/apps/wizard/app_pages/similar_charts/app.py
index 8d2be19c293e..5d3ec2152278 100644
--- a/apps/wizard/app_pages/similar_charts/app.py
+++ b/apps/wizard/app_pages/similar_charts/app.py
@@ -24,6 +24,22 @@
 ########################################################################################################################
 
 
+@st.cache_data(show_spinner=False, persist="disk")
+def get_charts() -> list[data.Chart]:
+    with st.spinner("Loading charts..."):
+        # Get charts from the database.
+        df = data.get_raw_charts()
+
+        charts = df.to_dict(orient="records")
+
+        ret = []
+        for c in charts:
+            c["tags"] = c["tags"].split(";") if c["tags"] else []
+            ret.append(data.Chart(**c))  # type: ignore
+
+        return ret
+
+
 def st_chart_info(chart: data.Chart) -> None:
     chart_url = OWID_ENV.chart_site(chart.slug)
     title = f"#### [{chart.title}]({chart_url})"
@@ -74,14 +90,15 @@ def split_input_string(input_string: str) -> tuple[str, list[str], list[str]]:
 @st.cache_data(show_spinner=False, max_entries=1)
 def get_and_fit_model(charts: list[data.Chart]) -> scoring.ScoringModel:
-    scoring_model = scoring.ScoringModel(emb.get_model())
+    with st.spinner("Loading model..."):
+        scoring_model = scoring.ScoringModel(emb.get_model())
     scoring_model.fit(charts)
     return scoring_model
 
 
 ########################################################################################################################
 
 # Fetch all data indicators.
-charts = data.get_charts()
+charts = get_charts()
 
 # Get scoring model.
 scoring_model = get_and_fit_model(charts)
diff --git a/apps/wizard/app_pages/similar_charts/data.py b/apps/wizard/app_pages/similar_charts/data.py
index 9ad6cea55bdc..3690aa6f9aa4 100644
--- a/apps/wizard/app_pages/similar_charts/data.py
+++ b/apps/wizard/app_pages/similar_charts/data.py
@@ -1,8 +1,8 @@
 from dataclasses import dataclass
+from datetime import datetime
 from typing import Optional
 
 import pandas as pd
-import streamlit as st
 
 from apps.wizard.utils.embeddings import Doc
 from etl.db import read_sql
@@ -16,6 +16,7 @@ class Chart(Doc):
     note: str
     tags: list[str]
     slug: str
+    created_at: Optional[datetime] = None
     views_7d: Optional[int] = None
     views_14d: Optional[int] = None
     views_365d: Optional[int] = None
@@ -24,6 +25,8 @@
 
 def get_raw_charts() -> pd.DataFrame:
     """Get all charts that exist in the database."""
+    # TODO: allow archived charts to be returned. Maybe add an argument to the function.
+    # Get all data indicators from the database.
     query = """
     with tags as (
@@ -38,6 +41,7 @@
     )
     select
         c.id as chart_id,
+        c.createdAt as created_at,
         cf.slug,
         cf.full->>'$.title' as title,
         cf.full->>'$.subtitle' as subtitle,
@@ -62,19 +66,3 @@
     assert df["chart_id"].nunique() == df.shape[0]
 
     return df
-
-
-@st.cache_data(show_spinner=False, persist="disk")
-def get_charts() -> list[Chart]:
-    with st.spinner("Loading charts..."):
-        # Get charts from the database..
-        df = get_raw_charts()
-
-        charts = df.to_dict(orient="records")
-
-        ret = []
-        for c in charts:
-            c["tags"] = c["tags"].split(";") if c["tags"] else []
-            ret.append(Chart(**c))  # type: ignore
-
-        return ret
diff --git a/apps/wizard/utils/embeddings.py b/apps/wizard/utils/embeddings.py
index b0d2a27c124b..5a7e6408be90 100644
--- a/apps/wizard/utils/embeddings.py
+++ b/apps/wizard/utils/embeddings.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Callable, Generic, Optional, TypeVar
 
-import streamlit as st
 import torch
 from joblib import Memory
 from sentence_transformers import SentenceTransformer, util
@@ -44,8 +43,7 @@ def set_device() -> str:
 
 @memory.cache
 def get_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
     "Load the pre-trained model"
-    with st.spinner("Loading model..."):
-        model = SentenceTransformer(model_name)
+    model = SentenceTransformer(model_name)
     return model
 
diff --git a/etl/grapher/model.py b/etl/grapher/model.py
index 2b348807f548..9f43da1773f6 100644
--- a/etl/grapher/model.py
+++ b/etl/grapher/model.py
@@ -18,7 +18,7 @@
 import io
 import json
 import random
-from datetime import date, datetime
+from datetime import date, datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Union, get_args, overload
@@ -128,6 +128,52 @@ def create_table(cls, engine: Engine, if_exists: Literal["fail", "replace", "ski
             raise ValueError(f"Unrecognized value for if_exists: {if_exists}")
 
 
+class HousekeepingSuggestedReview(Base):
+    __tablename__ = "housekeeping_suggested_reviews"
+
+    id: Mapped[int] = mapped_column(
+        Integer,
+        primary_key=True,
+        init=False,
+        # autoincrement=True,
+        comment="Identifier of the review",
+    )
+    suggestedAt: Mapped[datetime] = mapped_column(
+        DateTime,
+        nullable=True,
+        server_default=text("CURRENT_TIMESTAMP"),
+        comment="Date when the review was suggested",
+    )
+    objectType: Mapped[str] = mapped_column(
+        String(255),
+        nullable=False,
+        comment="Type of the object to review (e.g., 'chart', 'dataset', etc.)",
+    )
+
+    objectId: Mapped[int] = mapped_column(Integer, nullable=False)
+
+    @classmethod
+    def load_reviews(cls, session: Session, object_type: Optional[str] = None) -> list["HousekeepingSuggestedReview"]:
+        if object_type is None:
+            vars = session.scalars(select(cls)).all()
+            return list(vars)
+        else:
+            vars = session.scalars(select(cls).where(cls.objectType == object_type)).all()
+            return list(vars)
+
+    @classmethod
+    def load_reviews_object_id(cls, session: Session, object_type: str) -> list[int]:
+        vars = session.scalars(select(cls.objectId).where(cls.objectType == object_type)).all()
+        return list(vars)
+
+    @classmethod
+    def add_review(cls, session: Session, object_type: str, object_id: int):
+        new_review = cls(objectType=object_type, objectId=object_id, suggestedAt=datetime.now(timezone.utc))
+        session.add(new_review)
+        session.commit()
+        # return new_review
+
+
 class Entity(Base):
     __tablename__ = "entities"
     __table_args__ = (Index("code", "code", unique=True), Index("name", "name", unique=True))
diff --git a/etl/slack_helpers.py b/etl/slack_helpers.py
new file mode 100644
index 000000000000..e7117303e3e7
--- /dev/null
+++ b/etl/slack_helpers.py
@@ -0,0 +1,55 @@
+"""Tools to assist with Slack interactions."""
+
+import json
+
+from slack_sdk import WebClient
+
+from etl import config
+
+slack_client = WebClient(token=config.SLACK_API_TOKEN)
+
+
+def send_slack_message(channel: str, message: str, **kwargs) -> None:
+    """Send `message` to Slack channel `channel`."""
+    if config.SLACK_API_TOKEN:
+        slack_client.chat_postMessage(channel=channel, text=message, **kwargs)
+
+
+def format_slack_message(method, url, status_code, req_body, res_body):
+    try:
+        res_body = json.dumps(json.loads(res_body), indent=2)
+    except json.decoder.JSONDecodeError:
+        pass
+
+    try:
+        req_body = json.dumps(json.loads(req_body), indent=2)
+    except json.decoder.JSONDecodeError:
+        pass
+
+    if status_code == 200:
+        emoji = ":information_source:"
+    else:
+        emoji = ":warning:"
+
+    message = f"{emoji} *{method}* {url}\n"
+
+    if req_body:
+        message += f"Request\n```\n{req_body}\n```\n"
+
+    message += f"Response\n```\n{res_body}\n```\n"
+
+    return message
+
+
+def get_channels():
+    response = slack_client.conversations_list()
+    channels = response["channels"]
+    return channels
+
+
+def channels_mapping():
+    channels = get_channels()
+    mapping = {}
+    for channel in channels:
+        mapping[channel["name"]] = channel["id"]
+    return mapping
diff --git a/scripts/housekeeper.sh b/scripts/housekeeper.sh
new file mode 100755
index 000000000000..518ca258c609
--- /dev/null
+++ b/scripts/housekeeper.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# housekeeper.sh
+#
+# Keep things at OWID clean
+#
+
+set -e
+
+start_time=$(date +%s)
+
+HOUR=$(TZ=Europe/Berlin date +%H)
+echo '--- Keep OWID clean'
+cd /home/owid/etl
+if [ "$HOUR" -eq "01" ]; then
+    uv run etl d housekeeper --review-type chart
+fi
+
+end_time=$(date +%s)
+
+echo "--- Done! ($(($end_time - $start_time))s)"
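
For reference, a minimal usage sketch of the consolidated `etl.slack_helpers.send_slack_message` helper (the message text below is illustrative; the channel name and the `username`/`icon_emoji` keywords are taken from this diff, and extra keyword arguments are simply forwarded to slack_sdk's `chat_postMessage`):

```python
from etl.slack_helpers import send_slack_message

# No-op unless SLACK_API_TOKEN is set in etl.config, mirroring the guard inside the helper.
send_slack_message(
    channel="#metadata-updates",
    message=":information_source: example message",  # illustrative text
    # Optional kwargs forwarded to chat_postMessage, as the housekeeper review does:
    username="housekeeper",
    icon_emoji="sus-blue",
)
```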