From 97783e05a44a2e7d8e0d5b21137ed277cf04c7f0 Mon Sep 17 00:00:00 2001 From: Paul Leclercq Date: Thu, 23 Jan 2025 10:18:01 +0100 Subject: [PATCH] fix(stop_word): case where keyword is empty to enable faster updates filter (#322) --- .../mediatree/update_pg_keywords.py | 33 +++++++---- test/sitemap/test_update_pg_keywords.py | 55 ++++++++++++++++++- test/stop_word/test_stop_word.py | 17 +++++- 3 files changed, 91 insertions(+), 14 deletions(-) diff --git a/quotaclimat/data_processing/mediatree/update_pg_keywords.py b/quotaclimat/data_processing/mediatree/update_pg_keywords.py index c58b932b..3705ec2b 100644 --- a/quotaclimat/data_processing/mediatree/update_pg_keywords.py +++ b/quotaclimat/data_processing/mediatree/update_pg_keywords.py @@ -5,26 +5,39 @@ import logging from sqlalchemy.orm import Session -from postgres.schemas.models import Keywords +from postgres.schemas.models import Keywords, Stop_Word from quotaclimat.data_processing.mediatree.detect_keywords import * from quotaclimat.data_processing.mediatree.api_import import get_stop_words from quotaclimat.data_processing.mediatree.channel_program import get_programs, get_a_program_with_start_timestamp, get_channel_title_for_name from sqlalchemy import func, select, and_, or_ + +def get_keyword_else_context(stop_word_object: Stop_Word): + if stop_word_object.keyword is not None: + return stop_word_object.keyword + else: + return stop_word_object.context + +def get_top_keyword_of_stop_words(stop_word_keyword_only: bool, stop_words_objects: List[Stop_Word]): + top_keyword_of_stop_words = [] + if stop_word_keyword_only and (len(stop_words_objects) > 0): + logging.warning(f"Using stop words to filter rows inside Keywords table") + + top_keyword_of_stop_words = set(map(lambda stop: get_keyword_else_context(stop), stop_words_objects)) + logging.info(f"stop words keywords : {top_keyword_of_stop_words}") + else: + logging.info(f"No filter on plaintext for Keywords table - stop_word_keyword_only env variable to false") + + return top_keyword_of_stop_words + def update_keywords(session: Session, batch_size: int = 50000, start_date : str = "2023-04-01", program_only=False, \ end_date: str = "2023-04-30", channel: str = "", empty_program_only=False, \ stop_word_keyword_only = False) -> list: df_programs = get_programs() - stop_words_object = get_stop_words(session, validated_only=True, context_only=False) - stop_words = list(map(lambda stop: stop.context, stop_words_object)) - if stop_word_keyword_only and (len(stop_words) > 0): - logging.warning(f"Using stop words to filter rows inside Keywords table") - top_keyword_of_stop_words = set(map(lambda stop: stop.keyword, stop_words_object)) - logging.info(f"stop words keywords :\n {top_keyword_of_stop_words}") - else: - logging.info(f"No filter on plaintext for Keywords table - stop_word_keyword_only env variable to false") - top_keyword_of_stop_words = [] + stop_words_objects = get_stop_words(session, validated_only=True, context_only=False) + stop_words = list(map(lambda stop: stop.context, stop_words_objects)) + top_keyword_of_stop_words = get_top_keyword_of_stop_words(stop_word_keyword_only, stop_words_objects=stop_words_objects) total_updates = get_total_count_saved_keywords(session, start_date, end_date, channel, empty_program_only, keywords_to_includes=top_keyword_of_stop_words) diff --git a/test/sitemap/test_update_pg_keywords.py b/test/sitemap/test_update_pg_keywords.py index d98a6088..d92d3115 100644 --- a/test/sitemap/test_update_pg_keywords.py +++ b/test/sitemap/test_update_pg_keywords.py @@ -946,7 +946,60 @@ def test_update_only_keywords_that_includes_some_keywords(): conn.dispose() session.close() assert result_after_update.number_of_keywords_climat == number_of_keywords_climat - + + +def test_get_top_keyword_of_stop_words_stop_word_keyword_only_True(): + sw1 = Stop_Word( + id="test" + ,keyword_id="test" + ,channel_title="test" + ,context="test_context" + ,count=10 + ,keyword="keyword1" + ,created_at="test" + ,start_date="test" + ,updated_at="test" + ,validated=True + ) + + sw2 = Stop_Word( + id="test" + ,keyword_id="test" + ,channel_title="test" + ,context="test_context" + ,count=10 + ,keyword="keyword2" + ,created_at="test" + ,start_date="test" + ,updated_at="test" + ,validated=True + ) + + sw3 = Stop_Word( + id="test" + ,keyword_id="test" + ,channel_title="test" + ,context="test_context" + ,count=10 + # ,keyword="keyword2" empty keyword it should use context + ,created_at="test" + ,start_date="test" + ,updated_at="test" + ,validated=True + ) + stop_words_objects = [sw1, sw1, sw1, sw2, sw3] + + output = get_top_keyword_of_stop_words(stop_word_keyword_only=True, stop_words_objects=stop_words_objects) + expected = set(["keyword1", "keyword2", "test_context"]) + + assert output == expected + +def test_get_top_keyword_of_stop_words_stop_word_keyword_only_False(): + output = get_top_keyword_of_stop_words(stop_word_keyword_only=False, stop_words_objects=[]) + expected = [] + + assert output == expected + def test_update_nothing_because_no_keywords_are_included(): conn = connect_to_db() session = get_db_session(conn) diff --git a/test/stop_word/test_stop_word.py b/test/stop_word/test_stop_word.py index 820fa512..ea80e6f0 100644 --- a/test/stop_word/test_stop_word.py +++ b/test/stop_word/test_stop_word.py @@ -233,6 +233,15 @@ def test_stop_word_save_append_stop_word(): "context": "lacieux selon les experts question climatique en fait elle dépasse la question ", "count": 19, "id" : get_consistent_hash("lacieux selon les experts question climatique en fait elle dépasse la question "), + }, + { + "keyword_id": "fake_id", + "id": "test2", + # "keyword": ", empty + "channel_title": "TF1", + "context": "empty_keyword", + "count": 19, + "id" : get_consistent_hash("empty_keyword"), } ] save_append_stop_word(conn, to_save) @@ -240,7 +249,7 @@ def test_stop_word_save_append_stop_word(): # get all stop word from db stop_words = get_all_stop_word(session) - assert len(stop_words) == 2 + assert len(stop_words) == 3 assert stop_words[0].keyword == "replantation" assert stop_words[1].keyword == "climatique" assert stop_words[0].count == 20 @@ -249,6 +258,8 @@ def test_stop_word_save_append_stop_word(): assert stop_words[1].channel_title == "TF1" assert stop_words[0].context == " avait promis de lancer un plan de replantation euh hélas pas pu tout s' est pa" assert stop_words[1].context == "lacieux selon les experts question climatique en fait elle dépasse la question " + assert stop_words[2].context == "empty_keyword" + assert stop_words[2].keyword == None def test_stop_word_main(): conn = connect_to_db() @@ -256,7 +267,7 @@ def test_stop_word_main(): # get all stop word from db stop_words = get_all_stop_word(session) - assert len(stop_words) == 2 + assert len(stop_words) == 3 def test_stop_word_is_already_known_stop_word(): @@ -270,4 +281,4 @@ def test_stop_word_save_append_stop_word_duplicate(): save = [{'context': 'ts historiques les bouleversements climatiques de très fortes chaleurs les vict', 'keyword_id': 'fca7a8626fb7762ba6849f1cbc3b2708099118352442025e9e99b8b451e698d3', 'count': 41, 'id': 'a9017d023deadedd24a915ea3ad2eac5c79a94a9dcd87214fc3a0378b0acf8c5', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'tion des conflits les dérèglements climatiques et la crise économique aggrave l', 'keyword_id': 'd9e9290ef2fa15ab4d54e387bc3ef6204c5b34e0effa00f2479d9ed51442cbd6', 'count': 27, 'id': 'd2f44307d8057427f48e67c2400d9e3e8cb175d080f0f492a168c794178bb4dc', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'aux pollutions et aux dérèglements climatiques olivier emond embarqué à bord de', 'keyword_id': 'e683a12b0260af1a9f22717165e869e30ed13dbb6b5da54b6c3c2adcd3b02308', 'count': 23, 'id': '5637bf04ae3d04d76a952d7d3864795fdb32d8d8f79a86b31115b5ab46cad9d9', 'keyword': 'climatique', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "nt de la lpo comment accueillir la biodiversité limiter l' érosion stocker le c", 'keyword_id': 'f2c11b05a19ad10e662c0669bbac7887c490b50b80cce9c3253ced24e5c7515c', 'count': 36, 'id': '09a1bb6f54bb34134d4010ce8c97991983ece3be1f598210596ff026612534fa', 'keyword': 'biodiversité', 'channel_title': 'Arte', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "t s' engage dans le dispositif mon leasing électrique avec renault twingo hi te", 'keyword_id': 'ff68bad3169c0dfd7f382eb7ef25d2919f53a46fcdffd017c496eb239c96e151', 'count': 57, 'id': 'f6196da2b5d0eabe34fb08f4d9fc3e834b18f934298c3bc5b777194151909e22', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'ros par mois seulement grâce à mon leasing électrique comme ça tout le monde pe', 'keyword_id': 'fe506268294d1eae63bc7c40f2205436821824d5db78d095924e21e9103fce44', 'count': 47, 'id': '571bec95475147a5fc7b4d5497ae52a36fa570efe197865c2874ed531687814d', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "t s' engage dans le dispositif mon leasing électrique avec renault mégane texte", 'keyword_id': 'fa6a852a31d9447978e886b4c621d20d69a9852082f0569e8e20475ba72a4a70', 'count': 38, 'id': '6cc34be846b6d9e39db9d7f514c0b54edb1e99d52053476c7fd0831e40287ac6', 'keyword': 'leasing électrique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': ' accessible fallait bien ça car la transition énergétique ça compte mais aussi ', 'keyword_id': 'f8d11f03feef88edbaf4646ddd2d29719825b3062683e0b54e4a377544b93c97', 'count': 42, 'id': 'b3f4ea8c4fd9ed3ef702faed1caccecca8d562a5bbf8c17705157517a322d35e', 'keyword': 'transition énergétique', 'channel_title': 'Europe 1', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'e a été formé pour créer une ferme agro écologique faites un don pour que son h', 'keyword_id': 'fff2e7844e4a926d6e744d36ef2ccfa6ecfb3f680ba139c5785a071c824d8159', 'count': 32, 'id': '2cbdb8b5d730926696abe523e953745266d33fd6a5198e1815394ba7d78daa8d', 'keyword': 'agro écologique', 'channel_title': 'France 2', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': "mérique favorisant l' émergence d' énergie renouvelable en afrique francophone ", 'keyword_id': 'fb1984e3c30e3f97840dcc0db9359b31c18acf08174c05be0532e371c99eeb0e', 'count': 24, 'id': '654d6946f00b27cd6d61dd051753fe7960eb2499be29a1b176e76cbf8acaed39', 'keyword': 'énergie renouvelable', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'ts historiques les bouleversements climatiques de très fortes chaleurs les vict', 'keyword_id': 'fca7a8626fb7762ba6849f1cbc3b2708099118352442025e9e99b8b451e698d3', 'count': 41, 'id': 'a9017d023deadedd24a915ea3ad2eac5c79a94a9dcd87214fc3a0378b0acf8c5', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'tion des conflits les dérèglements climatiques et la crise économique aggrave l', 'keyword_id': 'd9e9290ef2fa15ab4d54e387bc3ef6204c5b34e0effa00f2479d9ed51442cbd6', 'count': 26, 'id': 'd2f44307d8057427f48e67c2400d9e3e8cb175d080f0f492a168c794178bb4dc', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}, {'context': 'aux pollutions et aux dérèglements climatiques olivier emond embarqué à bord de', 'keyword_id': 'e683a12b0260af1a9f22717165e869e30ed13dbb6b5da54b6c3c2adcd3b02308', 'count': 23, 'id': '5637bf04ae3d04d76a952d7d3864795fdb32d8d8f79a86b31115b5ab46cad9d9', 'keyword': 'climat', 'channel_title': 'RFI', 'start_date': datetime(2024, 1, 14, 10, 2, 21)}] save_append_stop_word(conn, save) stop_words = get_all_stop_word(session) - assert len(stop_words) == 12 \ No newline at end of file + assert len(stop_words) == 13 \ No newline at end of file