From 9e560d26dce293e36951abecd6ce0de2c21b5d08 Mon Sep 17 00:00:00 2001 From: Theodlz Date: Mon, 8 Apr 2024 12:39:44 -0700 Subject: [PATCH 1/5] don't re-trigger if there are any recently (less than 12 hours) failed requests --- kowalski/alert_brokers/alert_broker.py | 29 ++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/kowalski/alert_brokers/alert_broker.py b/kowalski/alert_brokers/alert_broker.py index eb840471..e2dc0709 100644 --- a/kowalski/alert_brokers/alert_broker.py +++ b/kowalski/alert_brokers/alert_broker.py @@ -2067,10 +2067,31 @@ def alert_sentinel_skyportal( if any( [ status in str(r["status"]).lower() - for status in ["complete", "submitted", "deleted"] + for status in ["complete", "submitted", "deleted", "failed"] ] ) ] + # filter out the failed requests that failed more than 12 hours + # this is for to us avoid re-triggering on requests that failed LESS than 12 hours ago + # we keep those recently failed ones so that the code underneath finds an existing request + # and does not retriggers a new one. Usually, failed requests are reprocessed during the day and marked + # as complete, which is why we can afford to wait 12 hours before re-triggering (basically until the next night) + now_utc = datetime.datetime.utcnow() + existing_requests = [ + r + for r in existing_requests + if not ( + "failed" in str(r["status"]).lower() + and ( + now_utc + - datetime.datetime.strptime( + r["modified"], "%Y-%m-%dT%H:%M:%S.%f" + ) + ).total_seconds() + > 12 * 3600 + ) + ] + # sort by priority (highest first) existing_requests = sorted( existing_requests, @@ -2212,15 +2233,15 @@ def alert_sentinel_skyportal( # if there is an existing request, but the priority is lower than the one we want to post, # update the existing request with the new priority request_to_update = existing_requests_filtered[0][1] - # if the status is completed or deleted, do not update + # if the status is completed, deleted, or failed, do not update if any( [ status in str(request_to_update["status"]).lower() - for status in ["complete", "deleted"] + for status in ["complete", "deleted", "failed"] ] ): log( - f"Followup request for {alert['objectId']} and allocation_id {passed_filter['auto_followup']['allocation_id']} already exists on SkyPortal, but is completed or deleted, no need for update" + f"Followup request for {alert['objectId']} and allocation_id {passed_filter['auto_followup']['allocation_id']} already exists on SkyPortal, but is completed, deleted or failed (recently), no need for update" ) # if the status is submitted, and the new priority is higher, update if "submitted" in str( From bdfc90411281d4c66366be7936b15725435bd4e1 Mon Sep 17 00:00:00 2001 From: Theophile du Laz Date: Mon, 8 Apr 2024 12:47:07 -0700 Subject: [PATCH 2/5] fix comment --- kowalski/alert_brokers/alert_broker.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kowalski/alert_brokers/alert_broker.py b/kowalski/alert_brokers/alert_broker.py index e2dc0709..b17a76a7 100644 --- a/kowalski/alert_brokers/alert_broker.py +++ b/kowalski/alert_brokers/alert_broker.py @@ -2071,11 +2071,13 @@ def alert_sentinel_skyportal( ] ) ] - # filter out the failed requests that failed more than 12 hours - # this is for to us avoid re-triggering on requests that failed LESS than 12 hours ago - # we keep those recently failed ones so that the code underneath finds an existing request - # and does not retriggers a new one. Usually, failed requests are reprocessed during the day and marked - # as complete, which is why we can afford to wait 12 hours before re-triggering (basically until the next night) + # filter out the failed requests that failed more than 12 hours. This is for to avoid + # re-triggering on objects where existing requests failed LESS than 12 hours ago. + # + # We keep these recently failed ones so that the code underneath finds an existing request and + # does not retrigger a new one. Usually, failed requests are reprocessed during the day and marked + # as complete, which is why we can afford to wait 12 hours before re-triggering + # (basically until the next night) now_utc = datetime.datetime.utcnow() existing_requests = [ r From 643ffaf331cd77d37da51f2d42907cd3b16653fe Mon Sep 17 00:00:00 2001 From: Theodlz Date: Mon, 8 Apr 2024 12:49:11 -0700 Subject: [PATCH 3/5] lint --- kowalski/alert_brokers/alert_broker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kowalski/alert_brokers/alert_broker.py b/kowalski/alert_brokers/alert_broker.py index b17a76a7..50a1ad8e 100644 --- a/kowalski/alert_brokers/alert_broker.py +++ b/kowalski/alert_brokers/alert_broker.py @@ -2073,7 +2073,7 @@ def alert_sentinel_skyportal( ] # filter out the failed requests that failed more than 12 hours. This is for to avoid # re-triggering on objects where existing requests failed LESS than 12 hours ago. - # + # # We keep these recently failed ones so that the code underneath finds an existing request and # does not retrigger a new one. Usually, failed requests are reprocessed during the day and marked # as complete, which is why we can afford to wait 12 hours before re-triggering From d129a7ecd4d8faebd2ec415bd4b59e8d45f2c0d4 Mon Sep 17 00:00:00 2001 From: Theodlz Date: Mon, 8 Apr 2024 12:50:23 -0700 Subject: [PATCH 4/5] fix some more things in the comments as per Michael --- kowalski/alert_brokers/alert_broker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kowalski/alert_brokers/alert_broker.py b/kowalski/alert_brokers/alert_broker.py index 50a1ad8e..ddb388e8 100644 --- a/kowalski/alert_brokers/alert_broker.py +++ b/kowalski/alert_brokers/alert_broker.py @@ -2071,7 +2071,7 @@ def alert_sentinel_skyportal( ] ) ] - # filter out the failed requests that failed more than 12 hours. This is for to avoid + # filter out the failed requests that failed more than 12 hours. This is to avoid # re-triggering on objects where existing requests failed LESS than 12 hours ago. # # We keep these recently failed ones so that the code underneath finds an existing request and @@ -2235,7 +2235,7 @@ def alert_sentinel_skyportal( # if there is an existing request, but the priority is lower than the one we want to post, # update the existing request with the new priority request_to_update = existing_requests_filtered[0][1] - # if the status is completed, deleted, or failed, do not update + # if the status is completed, deleted, or failed do not update if any( [ status in str(request_to_update["status"]).lower() From e82f8c51e69e994e09a07fc0bd669a2f066378a8 Mon Sep 17 00:00:00 2001 From: Theodlz Date: Mon, 8 Apr 2024 15:46:22 -0700 Subject: [PATCH 5/5] try except to avoid issues applying filters or triggering if we can't remove old requests --- kowalski/alert_brokers/alert_broker.py | 37 ++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/kowalski/alert_brokers/alert_broker.py b/kowalski/alert_brokers/alert_broker.py index ddb388e8..14f3181d 100644 --- a/kowalski/alert_brokers/alert_broker.py +++ b/kowalski/alert_brokers/alert_broker.py @@ -2078,21 +2078,30 @@ def alert_sentinel_skyportal( # does not retrigger a new one. Usually, failed requests are reprocessed during the day and marked # as complete, which is why we can afford to wait 12 hours before re-triggering # (basically until the next night) - now_utc = datetime.datetime.utcnow() - existing_requests = [ - r - for r in existing_requests - if not ( - "failed" in str(r["status"]).lower() - and ( - now_utc - - datetime.datetime.strptime( - r["modified"], "%Y-%m-%dT%H:%M:%S.%f" - ) - ).total_seconds() - > 12 * 3600 + try: + now_utc = datetime.datetime.utcnow() + existing_requests = [ + r + for r in existing_requests + if not ( + "failed" in str(r["status"]).lower() + and ( + now_utc + - datetime.datetime.strptime( + r["modified"], "%Y-%m-%dT%H:%M:%S.%f" + ) + ).total_seconds() + > 12 * 3600 + ) + ] + except Exception as e: + # to avoid not triggering at all if the above fails, we log that failure + # and keep all failed requests in the list. That way if there are any failed requests + # we won't trigger, but we will trigger as usual if they're isn't any failed requests + # it's just a fail-safe + log( + f"Failed to filter out failed followup requests for {alert['objectId']}: {e}" ) - ] # sort by priority (highest first) existing_requests = sorted(