Skip to content

Commit

Permalink
🎉 slack bot housekeeper (#3796)
Browse files Browse the repository at this point in the history
* 🎉 slack bot to review charts

* 🔨 Refactor Slack message handling into dedicated helper functions

* 🔨 Refactor Slack message handling into dedicated helper functions

* 🔨 Refactor Slack message sending to use helper function

* remove streamlit dependency from data module

* utils to get slack channels

* add createdAt

* wip

* 🎉 slack bot housekeeper

* add ORM for housekeeper

* tools to get reviews

* wip

* fixes

* slowly remove streamlit dependency from pure backend code

* re-structure command for chart review

* prettier

* adjust script
  • Loading branch information
lucasrodes authored and Tuna Acisu committed Feb 5, 2025
1 parent 0612d91 commit 5209a2e
Show file tree
Hide file tree
Showing 16 changed files with 282 additions and 74 deletions.
9 changes: 3 additions & 6 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
from api.v1 import v1
from etl import config
from etl.db import get_engine

from . import utils
from etl.slack_helpers import format_slack_message, send_slack_message

log = structlog.get_logger()

Expand Down Expand Up @@ -66,11 +65,9 @@ async def slack_middleware(request: Request, call_next):

log.info("response", method=request.method, url=str(request.url), status_code=response.status_code, body=res_body)

utils.send_slack_message(
send_slack_message(
"#metadata-updates",
utils.format_slack_message(
request.method, request.url, response.status_code, req_body.decode(), res_body.decode()
),
format_slack_message(request.method, request.url, response.status_code, req_body.decode(), res_body.decode()),
)

return Response(
Expand Down
38 changes: 0 additions & 38 deletions api/utils.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,5 @@
import json
from typing import Any, Dict

from slack_sdk import WebClient

from etl import config

slack_client = WebClient(token=config.SLACK_API_TOKEN)


def prune_none(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* with every None-valued entry removed."""
    kept = ((key, value) for key, value in d.items() if value is not None)
    return dict(kept)


def send_slack_message(channel: str, message: str) -> None:
    """Post *message* to the Slack *channel*; silently no-op when no API token is configured."""
    if not config.SLACK_API_TOKEN:
        return
    slack_client.chat_postMessage(channel=channel, text=message)


def format_slack_message(method, url, status_code, req_body, res_body):
    """Build a Slack-formatted summary of an HTTP request/response pair.

    JSON bodies are pretty-printed; non-JSON bodies are left untouched.
    A 200 response gets an info emoji, anything else a warning emoji.
    """

    def _prettify(body):
        # Re-indent JSON payloads; fall back to the raw text for non-JSON.
        try:
            return json.dumps(json.loads(body), indent=2)
        except json.JSONDecodeError:
            return body

    req_body = _prettify(req_body)
    res_body = _prettify(res_body)

    emoji = ":information_source:" if status_code == 200 else ":warning:"

    parts = [f"{emoji} *{method}* {url}\n"]
    if req_body:
        parts.append(f"Request\n```\n{req_body}\n```\n")
    parts.append(f"Response\n```\n{res_body}\n```\n")
    return "".join(parts)
8 changes: 3 additions & 5 deletions apps/chart_sync/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import structlog
from rich import print
from rich_click.rich_command import RichCommand
from slack_sdk import WebClient
from sqlalchemy.orm import Session

from apps.chart_sync.admin_api import AdminAPI
Expand All @@ -17,6 +16,7 @@
from etl.config import OWIDEnv, get_container_name
from etl.datadiff import _dict_diff
from etl.grapher import model as gm
from etl.slack_helpers import send_slack_message

config.enable_bugsnag()

Expand Down Expand Up @@ -273,8 +273,7 @@ def _notify_slack_chart_update(chart_id: int, source: str, diff: ChartDiff, dry_

if config.SLACK_API_TOKEN and not dry_run:
assert diff.target_chart
slack_client = WebClient(token=config.SLACK_API_TOKEN)
slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
send_slack_message(channel="#data-architecture-github", message=message)


def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool) -> None:
Expand All @@ -286,8 +285,7 @@ def _notify_slack_chart_create(source_chart_id: int, source: str, dry_run: bool)
print(message)

if config.SLACK_API_TOKEN and not dry_run:
slack_client = WebClient(token=config.SLACK_API_TOKEN)
slack_client.chat_postMessage(channel="#data-architecture-github", text=message)
send_slack_message(channel="#data-architecture-github", message=message)


def _matches_include_exclude(chart: gm.Chart, session: Session, include: Optional[str], exclude: Optional[str]):
Expand Down
1 change: 1 addition & 0 deletions apps/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def _lazy_load(self, cmd_name):
"map-datasets": "apps.utils.map_datasets.cli",
"scan-chart-diff": "apps.utils.scan_chart_diff.cli",
"profile": "apps.utils.profile.cli",
"housekeeper": "apps.housekeeper.cli.main",
},
},
"b": {
Expand Down
12 changes: 12 additions & 0 deletions apps/housekeeper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Assist with housekeeping tasks.
The initial motivation for this was to help with the problem of chart maintenance:
"A growing problem we have at OWID is that our database contains a very high number of charts, and this number keeps growing month by month. Many charts are good and worth keeping, but several hundred at least are charts that aren't maintained, updated, and generally up to our current standards.
These charts get few views but still "clog" our internal admin and search results (on OWID and search engines). Overall, these charts take mental space that we could instead allocate to maintaining our most important charts."
TODOs:
Add option of regular reviews of datasets, etc.
"""
64 changes: 64 additions & 0 deletions apps/housekeeper/charts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from datetime import datetime, timedelta

import pandas as pd

from apps.housekeeper.utils import add_reviews, get_reviews_id
from apps.wizard.app_pages.similar_charts.data import get_raw_charts
from etl.config import OWID_ENV
from etl.slack_helpers import send_slack_message

CHANNEL_NAME = "#lucas-playground"
SLACK_USERNAME = "housekeeper"


def get_charts_to_review():
    """Return charts that are at least one year old and not yet suggested for review.

    Returns a DataFrame with the same columns as `get_raw_charts` (must include
    `created_at` and `chart_id`).
    """
    df = get_raw_charts()

    # Keep only charts older than a year. Subtract a timedelta instead of
    # `replace(year=...)`: the latter raises ValueError when today is Feb 29,
    # since that date does not exist in the previous year.
    year_ago = datetime.today() - timedelta(days=365)
    df = df.loc[df["created_at"] < year_ago]

    # Discard charts already presented in the chat.
    reviewed_ids = get_reviews_id(object_type="chart")
    df = df.loc[~df["chart_id"].isin(reviewed_ids)]

    return df


def select_chart(df: pd.DataFrame):
    """Pick the least-viewed chart, using 365d views first and shorter windows as tie-breakers."""
    ranked = df.sort_values(["views_365d", "views_14d", "views_7d"])
    # First row after the ascending sort is the chart with the fewest views.
    return ranked.iloc[0]


def send_slack_chart_review(channel_name: str, slack_username: str, icon_emoji: str):
    """Pick one old, unreviewed chart, announce it on Slack, and record the suggestion."""
    # Candidates: charts older than a year that were never suggested before.
    candidates = get_charts_to_review()
    chart = select_chart(candidates)

    # Compose the daily review message.
    today = datetime.today().date().strftime("%d %b, %Y")
    message = (
        f"{today}: *Daily chart to review is...*\n"
        f"<{OWID_ENV.chart_site(chart['slug'])}|{chart['title']}> ({chart['views_365d']} views in the last year)\n"
        f"Go to <{OWID_ENV.chart_admin_site(chart['chart_id'])}|edit :writing_hand:>\n"
    )

    send_slack_message(
        channel=channel_name,
        message=message,
        icon_emoji=icon_emoji,
        username=slack_username,
    )

    # Remember the suggestion so the same chart is not proposed twice.
    add_reviews(object_type="chart", object_id=chart["chart_id"])
29 changes: 29 additions & 0 deletions apps/housekeeper/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Keep things in OWID catalog clean by regularly checking and reviewing content."""

import click
from rich_click import RichCommand

from apps.housekeeper.charts import send_slack_chart_review

# TODO: Add more review types
REVIEW_TYPES = [
"chart",
# "dataset",
]

# Config
CHANNEL_NAME = "#lucas-playground"
SLACK_USERNAME = "housekeeper"
ICON_EMOJI = "sus-blue"


@click.command("housekeeper", cls=RichCommand, help=__doc__)
@click.option("--review-type", "-t", type=click.Choice(REVIEW_TYPES, case_sensitive=False))
def main(review_type: str):
    """Dispatch the requested housekeeping review."""
    # Only chart reviews are implemented so far; anything else is a no-op.
    if review_type != "chart":
        return
    send_slack_chart_review(
        channel_name=CHANNEL_NAME,
        slack_username=SLACK_USERNAME,
        icon_emoji=ICON_EMOJI,
    )
18 changes: 18 additions & 0 deletions apps/housekeeper/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from sqlalchemy.orm import Session

from etl.config import OWID_ENV
from etl.grapher import model as gm


def get_reviews_id(object_type: str):
    """Return the ids of all objects of *object_type* already suggested for review."""
    with Session(OWID_ENV.engine) as session:
        ids = gm.HousekeepingSuggestedReview.load_reviews_object_id(session, object_type=object_type)
    return ids


def add_reviews(object_type: str, object_id: int):
    """Record that object *object_id* of *object_type* has been suggested for review."""
    with Session(OWID_ENV.engine) as session:
        gm.HousekeepingSuggestedReview.add_review(
            session=session, object_type=object_type, object_id=object_id
        )
3 changes: 2 additions & 1 deletion apps/wizard/app_pages/indicator_search/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def deduplicate_dimensions(indicators: list[data.Indicator]) -> list[data.Indica
@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(_indicators: list[data.Indicator]) -> emb.EmbeddingsModel:
# Get embedding model.
model = emb.EmbeddingsModel(emb.get_model())
with st.spinner("Loading model..."):
model = emb.EmbeddingsModel(emb.get_model())
# Create an embedding for each indicator.
with st.spinner("Creating embeddings..."):
model.fit(_indicators)
Expand Down
3 changes: 2 additions & 1 deletion apps/wizard/app_pages/insight_search/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def get_authors_with_DIs(insights: list[data.Insight]) -> set[str]:
@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(insights: list[data.Insight]) -> emb.EmbeddingsModel:
# Get embedding model.
model = emb.EmbeddingsModel(emb.get_model())
with st.spinner("Loading model..."):
model = emb.EmbeddingsModel(emb.get_model())
# Create an embedding for each insight.
model.fit(insights)
return model
Expand Down
21 changes: 19 additions & 2 deletions apps/wizard/app_pages/similar_charts/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,22 @@
########################################################################################################################


@st.cache_data(show_spinner=False, persist="disk")
def get_charts() -> list[data.Chart]:
    """Load every chart from the database as a Chart object (cached, persisted to disk)."""
    with st.spinner("Loading charts..."):
        # Fetch raw chart rows from the database.
        df = data.get_raw_charts()
        records = df.to_dict(orient="records")

        # The tags column is a ";"-separated string; turn it into a list.
        for record in records:
            record["tags"] = record["tags"].split(";") if record["tags"] else []

        return [data.Chart(**record) for record in records]  # type: ignore


def st_chart_info(chart: data.Chart) -> None:
chart_url = OWID_ENV.chart_site(chart.slug)
title = f"#### [{chart.title}]({chart_url})"
Expand Down Expand Up @@ -74,14 +90,15 @@ def split_input_string(input_string: str) -> tuple[str, list[str], list[str]]:

@st.cache_data(show_spinner=False, max_entries=1)
def get_and_fit_model(charts: list[data.Chart]) -> scoring.ScoringModel:
scoring_model = scoring.ScoringModel(emb.get_model())
with st.spinner("Loading model..."):
scoring_model = scoring.ScoringModel(emb.get_model())
scoring_model.fit(charts)
return scoring_model


########################################################################################################################
# Fetch all data indicators.
charts = data.get_charts()
charts = get_charts()
# Get scoring model.
scoring_model = get_and_fit_model(charts)

Expand Down
22 changes: 5 additions & 17 deletions apps/wizard/app_pages/similar_charts/data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import pandas as pd
import streamlit as st

from apps.wizard.utils.embeddings import Doc
from etl.db import read_sql
Expand All @@ -16,6 +16,7 @@ class Chart(Doc):
note: str
tags: list[str]
slug: str
created_at: Optional[datetime] = None
views_7d: Optional[int] = None
views_14d: Optional[int] = None
views_365d: Optional[int] = None
Expand All @@ -24,6 +25,8 @@ class Chart(Doc):

def get_raw_charts() -> pd.DataFrame:
"""Get all charts that exist in the database."""
# TODO: allow archived charts to be returned. Maybe add argument to function

# Get all data indicators from the database.
query = """
with tags as (
Expand All @@ -38,6 +41,7 @@ def get_raw_charts() -> pd.DataFrame:
)
select
c.id as chart_id,
c.createdAt as created_at,
cf.slug,
cf.full->>'$.title' as title,
cf.full->>'$.subtitle' as subtitle,
Expand All @@ -62,19 +66,3 @@ def get_raw_charts() -> pd.DataFrame:
assert df["chart_id"].nunique() == df.shape[0]

return df


@st.cache_data(show_spinner=False, persist="disk")
def get_charts() -> list[Chart]:
with st.spinner("Loading charts..."):
# Get charts from the database..
df = get_raw_charts()

charts = df.to_dict(orient="records")

ret = []
for c in charts:
c["tags"] = c["tags"].split(";") if c["tags"] else []
ret.append(Chart(**c)) # type: ignore

return ret
4 changes: 1 addition & 3 deletions apps/wizard/utils/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import Callable, Generic, Optional, TypeVar

import streamlit as st
import torch
from joblib import Memory
from sentence_transformers import SentenceTransformer, util
Expand Down Expand Up @@ -44,8 +43,7 @@ def set_device() -> str:
@memory.cache
def get_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
"Load the pre-trained model"
with st.spinner("Loading model..."):
model = SentenceTransformer(model_name)
model = SentenceTransformer(model_name)
return model


Expand Down
Loading

0 comments on commit 5209a2e

Please sign in to comment.