Skip to content

Commit

Permalink
fix(stop_word): case where keyword is empty to enable faster updates filter (#322)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Jan 23, 2025
1 parent 989ff00 commit 97783e0
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 14 deletions.
33 changes: 23 additions & 10 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,39 @@
import logging

from sqlalchemy.orm import Session
from postgres.schemas.models import Keywords
from postgres.schemas.models import Keywords, Stop_Word
from quotaclimat.data_processing.mediatree.detect_keywords import *
from quotaclimat.data_processing.mediatree.api_import import get_stop_words
from quotaclimat.data_processing.mediatree.channel_program import get_programs, get_a_program_with_start_timestamp, get_channel_title_for_name
from sqlalchemy import func, select, and_, or_


def get_keyword_else_context(stop_word_object: "Stop_Word"):
    """Return the stop word's keyword, falling back to its context.

    A keyword that is ``None`` or an empty string is treated as missing, so
    the caller never receives an empty value when a context is available.

    Args:
        stop_word_object: Object exposing ``keyword`` and ``context``
            attributes.

    Returns:
        ``stop_word_object.keyword`` when it is non-empty, otherwise
        ``stop_word_object.context``.
    """
    # `or` covers both a missing (None) and an empty ("") keyword.
    return stop_word_object.keyword or stop_word_object.context

def get_top_keyword_of_stop_words(stop_word_keyword_only: bool, stop_words_objects: List[Stop_Word]):
    """Build the collection of keywords used to filter rows of the Keywords table.

    Args:
        stop_word_keyword_only: When true, the filter is derived from the
            given stop words; otherwise no filter is produced.
        stop_words_objects: Stop words from which the filter values are taken.

    Returns:
        An empty list when the flag is off or no stop word is given;
        otherwise a set holding, for each stop word, the value returned by
        ``get_keyword_else_context``.
    """
    # Guard clause: nothing to filter on.
    if not stop_word_keyword_only or len(stop_words_objects) == 0:
        logging.info(f"No filter on plaintext for Keywords table - stop_word_keyword_only env variable to false")
        return []

    logging.warning(f"Using stop words to filter rows inside Keywords table")

    # A set removes duplicates when several stop words share a keyword.
    top_keyword_of_stop_words = {get_keyword_else_context(stop) for stop in stop_words_objects}
    logging.info(f"stop words keywords : {top_keyword_of_stop_words}")
    return top_keyword_of_stop_words

def update_keywords(session: Session, batch_size: int = 50000, start_date : str = "2023-04-01", program_only=False, \
end_date: str = "2023-04-30", channel: str = "", empty_program_only=False, \
stop_word_keyword_only = False) -> list:
df_programs = get_programs()

stop_words_object = get_stop_words(session, validated_only=True, context_only=False)
stop_words = list(map(lambda stop: stop.context, stop_words_object))
if stop_word_keyword_only and (len(stop_words) > 0):
logging.warning(f"Using stop words to filter rows inside Keywords table")
top_keyword_of_stop_words = set(map(lambda stop: stop.keyword, stop_words_object))
logging.info(f"stop words keywords :\n {top_keyword_of_stop_words}")
else:
logging.info(f"No filter on plaintext for Keywords table - stop_word_keyword_only env variable to false")
top_keyword_of_stop_words = []
stop_words_objects = get_stop_words(session, validated_only=True, context_only=False)
stop_words = list(map(lambda stop: stop.context, stop_words_objects))
top_keyword_of_stop_words = get_top_keyword_of_stop_words(stop_word_keyword_only, stop_words_objects=stop_words_objects)

total_updates = get_total_count_saved_keywords(session, start_date, end_date, channel, empty_program_only, keywords_to_includes=top_keyword_of_stop_words)

Expand Down
55 changes: 54 additions & 1 deletion test/sitemap/test_update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,60 @@ def test_update_only_keywords_that_includes_some_keywords():
conn.dispose()
session.close()
assert result_after_update.number_of_keywords_climat == number_of_keywords_climat



def test_get_top_keyword_of_stop_words_stop_word_keyword_only_True():
    """With the flag on, keywords are de-duplicated and a stop word without keyword contributes its context."""
    # Fields shared by every stop word in this test; only `keyword` varies.
    shared_fields = dict(
        id="test",
        keyword_id="test",
        channel_title="test",
        context="test_context",
        count=10,
        created_at="test",
        start_date="test",
        updated_at="test",
        validated=True,
    )

    with_keyword1 = Stop_Word(keyword="keyword1", **shared_fields)
    with_keyword2 = Stop_Word(keyword="keyword2", **shared_fields)
    # No keyword given: the context is expected in the output instead.
    without_keyword = Stop_Word(**shared_fields)

    # The first stop word is repeated to check that duplicates collapse.
    stop_words_objects = [with_keyword1] * 3 + [with_keyword2, without_keyword]

    output = get_top_keyword_of_stop_words(stop_word_keyword_only=True, stop_words_objects=stop_words_objects)

    assert output == {"keyword1", "keyword2", "test_context"}

def test_get_top_keyword_of_stop_words_stop_word_keyword_only_False():
    """With the flag off and no stop words, an empty list is returned."""
    assert get_top_keyword_of_stop_words(stop_word_keyword_only=False, stop_words_objects=[]) == []

def test_update_nothing_because_no_keywords_are_included():
conn = connect_to_db()
session = get_db_session(conn)
Expand Down
17 changes: 14 additions & 3 deletions test/stop_word/test_stop_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,23 @@ def test_stop_word_save_append_stop_word():
"context": "lacieux selon les experts question climatique en fait elle dépasse la question ",
"count": 19,
"id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "),
},
{
"keyword_id": "fake_id",
"id": "test2",
# "keyword": ", empty
"channel_title": "TF1",
"context": "empty_keyword",
"count": 19,
"id" : get_consistent_hash("empty_keyword"),
}
]
save_append_stop_word(conn, to_save)

# get all stop word from db
stop_words = get_all_stop_word(session)

assert len(stop_words) == 2
assert len(stop_words) == 3
assert stop_words[0].keyword == "replantation"
assert stop_words[1].keyword == "climatique"
assert stop_words[0].count == 20
Expand All @@ -249,14 +258,16 @@ def test_stop_word_save_append_stop_word():
assert stop_words[1].channel_title == "TF1"
assert stop_words[0].context == " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa"
assert stop_words[1].context == "lacieux selon les experts question climatique en fait elle dépasse la question "
assert stop_words[2].context == "empty_keyword"
assert stop_words[2].keyword == None

def test_stop_word_main():
    """Run `manage_stop_word` and check the number of stop words read back afterwards."""
    conn = connect_to_db()
    manage_stop_word(conn=conn, duration=3000)

    # get all stop word from db
    # NOTE(review): `session` is not created in this function; presumably a
    # module-level session defined elsewhere in the test file - confirm.
    stop_words = get_all_stop_word(session)

    # The earlier `== 2` assertion contradicted this one and was removed.
    # Presumably the three rows are those left by the save/append test - confirm.
    assert len(stop_words) == 3


def test_stop_word_is_already_known_stop_word():
Expand All @@ -270,4 +281,4 @@ def test_stop_word_save_append_stop_word_duplicate():
save = [{'context': 'ts historiques les bouleversements climatiques de très fortes chaleurs les vict', 'keyword_id': 'fca7a8626fb7762ba6849f1cbc3b2708099118352442025e9e99b8b451e698d3', 'count': 41, 'id': 'a9017d023deadedd24a915ea3ad2eac5c79a94a9dcd87214fc3a0378b0acf8c5', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'tion des conflits les dérèglements climatiques et la crise économique aggrave l', 'keyword_id': 'd9e9290ef2fa15ab4d54e387bc3ef6204c5b34e0effa00f2479d9ed51442cbd6', 'count': 27, 'id': 'd2f44307d8057427f48e67c2400d9e3e8cb175d080f0f492a168c794178bb4dc', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'aux pollutions et aux dérèglements climatiques olivier emond embarqué à bord de', 'keyword_id': 'e683a12b0260af1a9f22717165e869e30ed13dbb6b5da54b6c3c2adcd3b02308', 'count': 23, 'id': '5637bf04ae3d04d76a952d7d3864795fdb32d8d8f79a86b31115b5ab46cad9d9', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "nt de la lpo comment accueillir la biodiversité limiter l' érosion stocker le c", 'keyword_id': 'f2c11b05a19ad10e662c0669bbac7887c490b50b80cce9c3253ced24e5c7515c', 'count': 36, 'id': '09a1bb6f54bb34134d4010ce8c97991983ece3be1f598210596ff026612534fa', 'keyword': 'biodiversité', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "t s' engage dans le dispositif mon leasing électrique avec renault twingo hi te", 'keyword_id': 'ff68bad3169c0dfd7f382eb7ef25d2919f53a46fcdffd017c496eb239c96e151', 'count': 57, 'id': 'f6196da2b5d0eabe34fb08f4d9fc3e834b18f934298c3bc5b777194151909e22', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'ros par mois seulement grâce à mon leasing électrique comme ça tout le monde pe', 'keyword_id': 
'fe506268294d1eae63bc7c40f2205436821824d5db78d095924e21e9103fce44', 'count': 47, 'id': '571bec95475147a5fc7b4d5497ae52a36fa570efe197865c2874ed531687814d', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "t s' engage dans le dispositif mon leasing électrique avec renault mégane texte", 'keyword_id': 'fa6a852a31d9447978e886b4c621d20d69a9852082f0569e8e20475ba72a4a70', 'count': 38, 'id': '6cc34be846b6d9e39db9d7f514c0b54edb1e99d52053476c7fd0831e40287ac6', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': ' accessible fallait bien ça car la transition énergétique ça compte mais aussi ', 'keyword_id': 'f8d11f03feef88edbaf4646ddd2d29719825b3062683e0b54e4a377544b93c97', 'count': 42, 'id': 'b3f4ea8c4fd9ed3ef702faed1caccecca8d562a5bbf8c17705157517a322d35e', 'keyword': 'transition énergétique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'e a été formé pour créer une ferme agro écologique faites un don pour que son h', 'keyword_id': 'fff2e7844e4a926d6e744d36ef2ccfa6ecfb3f680ba139c5785a071c824d8159', 'count': 32, 'id': '2cbdb8b5d730926696abe523e953745266d33fd6a5198e1815394ba7d78daa8d', 'keyword': 'agro écologique', 'channel_title': 'France 2', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "mérique favorisant l' émergence d' énergie renouvelable en afrique francophone ", 'keyword_id': 'fb1984e3c30e3f97840dcc0db9359b31c18acf08174c05be0532e371c99eeb0e', 'count': 24, 'id': '654d6946f00b27cd6d61dd051753fe7960eb2499be29a1b176e76cbf8acaed39', 'keyword': 'énergie renouvelable', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'ts historiques les bouleversements climatiques de très fortes chaleurs les vict', 'keyword_id': 'fca7a8626fb7762ba6849f1cbc3b2708099118352442025e9e99b8b451e698d3', 'count': 41, 'id': 
'a9017d023deadedd24a915ea3ad2eac5c79a94a9dcd87214fc3a0378b0acf8c5', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'tion des conflits les dérèglements climatiques et la crise économique aggrave l', 'keyword_id': 'd9e9290ef2fa15ab4d54e387bc3ef6204c5b34e0effa00f2479d9ed51442cbd6', 'count': 26, 'id': 'd2f44307d8057427f48e67c2400d9e3e8cb175d080f0f492a168c794178bb4dc', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'aux pollutions et aux dérèglements climatiques olivier emond embarqué à bord de', 'keyword_id': 'e683a12b0260af1a9f22717165e869e30ed13dbb6b5da54b6c3c2adcd3b02308', 'count': 23, 'id': '5637bf04ae3d04d76a952d7d3864795fdb32d8d8f79a86b31115b5ab46cad9d9', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}]
save_append_stop_word(conn, save)
stop_words = get_all_stop_word(session)
assert len(stop_words) == 12
assert len(stop_words) == 13

1 comment on commit 97783e0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
postgres
   insert_data.py | 43 | 7 | 84% | 36–38, 57–59, 64
   insert_existing_data_example.py | 19 | 3 | 84% | 25–27
postgres/schemas
   models.py | 197 | 21 | 89% | 152–153, 156–163, 172, 179, 181–182, 247–248, 251–256, 267, 274–275
quotaclimat/data_ingestion
   scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py | 36 | 3 | 92% | 19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py | 249 | 153 | 39% | 44–48, 54–89, 93–96, 102, 111, 117, 120–122, 125–172, 178–193, 198, 211–223, 227–233, 247–259, 262–266, 272, 317–318, 321–352, 355–357
   channel_program.py | 162 | 57 | 65% | 21–23, 34–36, 53–54, 57–59, 98–99, 108, 124, 175–216
   config.py | 15 | 2 | 87% | 7, 16
   detect_keywords.py | 257 | 11 | 96% | 126–127, 283, 351–358, 400
   update_pg_keywords.py | 89 | 67 | 25% | 16–19, 22–31, 36–157, 181, 184, 188–189, 199–220, 254–291, 298
   utils.py | 96 | 32 | 67% | 29–53, 56, 65, 120–122, 135–138, 142–149
quotaclimat/data_processing/mediatree/s3
   api_to_s3.py | 151 | 100 | 34% | 64–78, 81–107, 110–123, 126–176, 179–205, 208–210
quotaclimat/data_processing/mediatree/stop_word
   main.py | 160 | 106 | 34% | 40–44, 70, 87–107, 117–183, 187–251, 255–284, 288–327, 330–332
quotaclimat/utils
   healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38
   logger.py | 24 | 11 | 54% | 22–24, 28–37
   sentry.py | 11 | 2 | 82% | 22–23
TOTAL | 1754 | 643 | 63% |

Tests Skipped Failures Errors Time
111 0 💤 0 ❌ 0 🔥 2m 39s ⏱️

Please sign in to comment.