Skip to content

Commit

Permalink
🎉 slack bot housekeeper (#3796)
Browse files Browse the repository at this point in the history
* 🎉 slack bot to review charts

* 🔨 Refactor Slack message handling into dedicated helper functions

* 🔨 Refactor Slack message handling into dedicated helper functions

* 🔨 Refactor Slack message sending to use helper function

* remove streamlit dependency from data module

* utils to get slack channels

* add createdAt

* wip

* 🎉 slack bot housekeeper

* add ORM for housekeeper

* tools to get reviews

* wip

* fixes

* slowly remove streamlit dependency from pure backend code

* re-structure command for chart review

* prettier

* adjust script
  • Loading branch information
lucasrodes authored and Tuna Acisu committed Feb 5, 2025
1 parent 0612d91 commit 5209a2e
Show file tree
Hide file tree
Showing 16 changed files with 282 additions and 74 deletions.
9 changes: 3 additions & 6 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
from api.v1 import v1
from etl import config
from etl.db import get_engine

from . import utils
from etl.slack_helpers import format_slack_message, send_slack_message

log = structlog.get_logger()

Expand Down Expand Up @@ -66,11 +65,9 @@ async def slack_middleware(request: Request, call_next):

log.info("response", method=request.method, url=str(request.url), status_code=response.status_code, body=res_body)

utils.send_slack_message(
send_slack_message(
"#metadata-updates",
utils.format_slack_message(
request.method, request.url, response.status_code, req_body.decode(), res_body.decode()
),
format_slack_message(request.method, request.url, response.status_code, req_body.decode(), res_body.decode()),
)

return Response(
Expand Down
38 changes: 0 additions & 38 deletions api/utils.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,5 @@
import json
from typing import Any, Dict

from slack_sdk import WebClient

from etl import config

slack_client = WebClient(token=config.SLACK_API_TOKEN)


def prune_none(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* with every None-valued entry removed."""
    kept = ((key, value) for key, value in d.items() if value is not None)
    return dict(kept)


def send_slack_message(channel: str, message: str) -> None:
    """Post *message* to the Slack *channel*; silently no-op when no API token is configured."""
    if not config.SLACK_API_TOKEN:
        return
    slack_client.chat_postMessage(channel=channel, text=message)


def format_slack_message(method, url, status_code, req_body, res_body):
    """Build a Slack-formatted summary of an HTTP request/response pair.

    JSON bodies are pretty-printed; non-JSON bodies are left untouched.
    A 200 response gets an info emoji, anything else a warning emoji.
    """

    def _prettify(body):
        # Re-indent JSON payloads; fall back to the raw text for non-JSON.
        try:
            return json.dumps(json.loads(body), indent=2)
        except json.JSONDecodeError:
            return body

    req_body = _prettify(req_body)
    res_body = _prettify(res_body)

    emoji = ":information_source:" if status_code == 200 else ":warning:"

    parts = [f"{emoji} *{method}* {url}\n"]
    if req_body:
        parts.append(f"Request\n```\n{req_body}\n```\n")
    parts.append(f"Response\n```\n{res_body}\n```\n")
    return "".join(parts)
8 changes: 3 additions & 5 deletions apps/chart_sync/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import structlog
from rich import print
from rich_click.rich_command import RichCommand
from slack_sdk import WebClient
from sqlalchemy.orm import Session

from apps.chart_sync.admin_api import AdminAPI
Expand All @@ -17,6 +16,7 @@
from etl.config import OWIDEnv, get_container_name
from etl.datadiff import _dict_diff
from etl.grapher import model as gm
from etl.slack_helpers import send_slack_message

config.enable_bugsnag()

Expand Down Expand Up @@ -273,8 +273,7 @@ def _notify_slack_chart_update(chart_id: int, source: str, diff: ChartDiff, dry_

if config.SLACK_API_TOKEN and not dry_run:
assert diff.target_chart
slack_client = WebClient(token=config.SLACK_API_TOKEN)
slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
send_slack_message(channel="#data-architecture-github", message=message)


def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool) -> None:
Expand All @@ -286,8 +285,7 @@ def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool)
print(message)

if config.SLACK_API_TOKEN and not dry_run:
slack_client = WebClient(token=config.SLACK_API_TOKEN)
slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
send_slack_message(channel="#data-architecture-github", message=message)


def _matches_include_exclude(chart: gm.Chart, session: Session, include: Optional[str], exclude: Optional[str]):
Expand Down
1 change: 1 addition & 0 deletions apps/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def _lazy_load(self, cmd_name):
"map-datasets": "apps.utils.map_datasets.cli",
"scan-chart-diff": "apps.utils.scan_chart_diff.cli",
"profile": "apps.utils.profile.cli",
"housekeeper": "apps.housekeeper.cli.main",
},
},
"b": {
Expand Down
12 changes: 12 additions & 0 deletions apps/housekeeper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Assist with housekeeping tasks.
The initial motivation for this was to help with the problem of chart maintenance:
"A growing problem we have at OWID is that our database contains a very high number of charts, and this number keeps growing month by month. Many charts are good and worth keeping, but several hundred at least are charts that aren't maintained, updated, and generally up to our current standards.
These charts get few views but still "clog" our internal admin and search results (on OWID and search engines). Overall, these charts take mental space that we could instead allocate to maintaining our most important charts."
TODOs:
Add option of regular reviews of datasets, etc.
"""
64 changes: 64 additions & 0 deletions apps/housekeeper/charts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from datetime import datetime, timedelta

import pandas as pd

from apps.housekeeper.utils import add_reviews, get_reviews_id
from apps.wizard.app_pages.similar_charts.data import get_raw_charts
from etl.config import OWID_ENV
from etl.slack_helpers import send_slack_message

CHANNEL_NAME = "#lucas-playground"
SLACK_USERNAME = "housekeeper"


def get_charts_to_review():
    """Return charts that are at least one year old and not yet suggested for review.

    Returns a DataFrame with the same columns as `get_raw_charts` (must include
    `created_at` and `chart_id`).
    """
    df = get_raw_charts()

    # Keep only charts older than a year. Subtract a timedelta instead of
    # `replace(year=...)`: the latter raises ValueError when today is Feb 29,
    # since that date does not exist in the previous year.
    year_ago = datetime.today() - timedelta(days=365)
    df = df.loc[df["created_at"] < year_ago]

    # Discard charts already presented in the chat.
    reviewed_ids = get_reviews_id(object_type="chart")
    df = df.loc[~df["chart_id"].isin(reviewed_ids)]

    return df


def select_chart(df: pd.DataFrame):
    """Pick the least-viewed chart, using 365d views first and shorter windows as tie-breakers."""
    ranked = df.sort_values(["views_365d", "views_14d", "views_7d"])
    # First row after the ascending sort is the chart with the fewest views.
    return ranked.iloc[0]


def send_slack_chart_review(channel_name: str, slack_username: str, icon_emoji: str):
    """Pick one old, unreviewed chart, announce it on Slack, and record the suggestion."""
    # Candidates: charts older than a year that were never suggested before.
    candidates = get_charts_to_review()
    chart = select_chart(candidates)

    # Compose the daily review message.
    today = datetime.today().date().strftime("%d %b, %Y")
    message = (
        f"{today}: *Daily chart to review is...*\n"
        f"<{OWID_ENV.chart_site(chart['slug'])}|{chart['title']}> ({chart['views_365d']} views in the last year)\n"
        f"Go to <{OWID_ENV.chart_admin_site(chart['chart_id'])}|edit :writing_hand:>\n"
    )

    send_slack_message(
        channel=channel_name,
        message=message,
        icon_emoji=icon_emoji,
        username=slack_username,
    )

    # Remember the suggestion so the same chart is not proposed twice.
    add_reviews(object_type="chart", object_id=chart["chart_id"])
29 changes: 29 additions & 0 deletions apps/housekeeper/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Keep things in OWID catalog clean by regularly checking and reviewing content."""

import click
from rich_click import RichCommand

from apps.housekeeper.charts import send_slack_chart_review

# TODO: Add more review types
REVIEW_TYPES = [
"chart",
# "dataset",
]

# Config
CHANNEL_NAME = "#lucas-playground"
SLACK_USERNAME = "housekeeper"
ICON_EMOJI = "sus-blue"


@click.command("housekeeper", cls=RichCommand, help=__doc__)
@click.option("--review-type", "-t", type=click.Choice(REVIEW_TYPES, case_sensitive=False))
def main(review_type: str):
    """Dispatch the requested housekeeping review."""
    # Only chart reviews are implemented so far; anything else is a no-op.
    if review_type != "chart":
        return
    send_slack_chart_review(
        channel_name=CHANNEL_NAME,
        slack_username=SLACK_USERNAME,
        icon_emoji=ICON_EMOJI,
    )
18 changes: 18 additions & 0 deletions apps/housekeeper/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from sqlalchemy.orm import Session

from etl.config import OWID_ENV
from etl.grapher import model as gm


def get_reviews_id(object_type: str):
    """Return the ids of all objects of *object_type* already suggested for review."""
    with Session(OWID_ENV.engine) as session:
        ids = gm.HousekeepingSuggestedReview.load_reviews_object_id(session, object_type=object_type)
    return ids


def add_reviews(object_type: str, object_id: int):
    """Record that object *object_id* of *object_type* has been suggested for review."""
    with Session(OWID_ENV.engine) as session:
        gm.HousekeepingSuggestedReview.add_review(
            session=session, object_type=object_type, object_id=object_id
        )
3 changes: 2 additions & 1 deletion apps/wizard/app_pages/indicator_search/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def deduplicate_dimensions(indicators: list[data.Indicator]) -> list[data.Indica
@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(_indicators: list[data.Indicator]) -> emb.EmbeddingsModel:
# Get embedding model.
model = emb.EmbeddingsModel(emb.get_model())
with st.spinner("Loading model..."):
model = emb.EmbeddingsModel(emb.get_model())
# Create an embedding for each indicator.
with st.spinner("Creating embeddings..."):
model.fit(_indicators)
Expand Down
3 changes: 2 additions & 1 deletion apps/wizard/app_pages/insight_search/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def get_authors_with_DIs(insights: list[data.Insight]) -> set[str]:
@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(insights: list[data.Insight]) -> emb.EmbeddingsModel:
# Get embedding model.
model = emb.EmbeddingsModel(emb.get_model())
with st.spinner("Loading model..."):
model = emb.EmbeddingsModel(emb.get_model())
# Create an embedding for each insight.
model.fit(insights)
return model
Expand Down
21 changes: 19 additions & 2 deletions apps/wizard/app_pages/similar_charts/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,22 @@
########################################################################################################################


@st.cache_data(show_spinner=False, persist="disk")
def get_charts() -> list[data.Chart]:
    """Load every chart from the database as a Chart object (cached, persisted to disk)."""
    with st.spinner("Loading charts..."):
        # Fetch raw chart rows from the database.
        df = data.get_raw_charts()
        records = df.to_dict(orient="records")

        # The tags column is a ";"-separated string; turn it into a list.
        for record in records:
            record["tags"] = record["tags"].split(";") if record["tags"] else []

        return [data.Chart(**record) for record in records]  # type: ignore


def st_chart_info(chart: data.Chart) -> None:
chart_url = OWID_ENV.chart_site(chart.slug)
title = f"#### [{chart.title}]({chart_url})"
Expand Down Expand Up @@ -74,14 +90,15 @@ def split_input_string(input_string: str) -> tuple[str, list[str], list[str]]:

@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(charts: list[data.Chart]) -> scoring.ScoringModel:
scoring_model = scoring.ScoringModel(emb.get_model())
with st.spinner("Loading model..."):
scoring_model = scoring.ScoringModel(emb.get_model())
scoring_model.fit(charts)
return scoring_model


########################################################################################################################
# Fetch all data indicators.
charts = data.get_charts()
charts = get_charts()
# Get scoring model.
scoring_model = get_and_fit_model(charts)

Expand Down
22 changes: 5 additions & 17 deletions apps/wizard/app_pages/similar_charts/data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import pandas as pd
import streamlit as st

from apps.wizard.utils.embeddings import Doc
from etl.db import read_sql
Expand All @@ -16,6 +16,7 @@ class Chart(Doc):
note: str
tags: list[str]
slug: str
created_at: Optional[datetime] = None
views_7d: Optional[int] = None
views_14d: Optional[int] = None
views_365d: Optional[int] = None
Expand All @@ -24,6 +25,8 @@ class Chart(Doc):

def get_raw_charts() -> pd.DataFrame:
"""Get all charts that exist in the database."""
# TODO: allow archived charts to be returned. Maybe add argument to function

# Get all data indicators from the database.
query = """
with tags as (
Expand All @@ -38,6 +41,7 @@ def get_raw_charts() -> pd.DataFrame:
)
select
c.id as chart_id,
c.createdAt as created_at,
cf.slug,
cf.full->>'$.title' as title,
cf.full->>'$.subtitle' as subtitle,
Expand All @@ -62,19 +66,3 @@ def get_raw_charts() -> pd.DataFrame:
assert df["chart_id"].nunique() == df.shape[0]

return df


@st.cache_data(show_spinner=False, persist="disk")
def get_charts() -> list[Chart]:
with st.spinner("Loading charts..."):
# Get charts from the database..
df = get_raw_charts()

charts = df.to_dict(orient="records")

ret = []
for c in charts:
c["tags"] = c["tags"].split(";") if c["tags"] else []
ret.append(Chart(**c)) # type: ignore

return ret
4 changes: 1 addition & 3 deletions apps/wizard/utils/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import Callable, Generic, Optional, TypeVar

import streamlit as st
import torch
from joblib import Memory
from sentence_transformers import SentenceTransformer, util
Expand Down Expand Up @@ -44,8 +43,7 @@ def set_device() -> str:
@memory.cache
def get_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
"Load the pre-trained model"
with st.spinner("Loading model..."):
model = SentenceTransformer(model_name)
model = SentenceTransformer(model_name)
return model


Expand Down
Loading

0 comments on commit 5209a2e

Please sign in to comment.