Skip to content

Commit

Permalink
Fix [#873]: Don't notify if parents down (#995)
Browse files Browse the repository at this point in the history
* Add basic notification viability test

* Don't notify if any parents are down, add recovery notification type, handle recoveries better

* Fix number in header and fix bug introduced in e7a17b3
  • Loading branch information
tsadpbb authored Oct 23, 2024
1 parent 402fbc4 commit c378fd1
Show file tree
Hide file tree
Showing 12 changed files with 683 additions and 66 deletions.
1 change: 1 addition & 0 deletions Changelog
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Nagios Core 4 Change Log
4.5.7 - 2024-11-19
------------------
* Expand the custom variable macros to an empty string, if the custom variable does not exist (Andre Klärner)
* Fix notification logic to not send a notification if a host and services parents are down (#873) (Dylan Anderson)
* Update Exfoliation theme (Dylan Anderson)

4.5.6 - 2024-10-08
Expand Down
39 changes: 20 additions & 19 deletions base/checks.c
Original file line number Diff line number Diff line change
Expand Up @@ -1064,8 +1064,8 @@ static inline void host_propagate_checks_to_immediate_parents(host * hst, int pa
log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to parent host(s)...\n");
for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
parent_host = temp_hostsmember->host_ptr;
if ((parent_host_up == TRUE && parent_host->current_state == HOST_UP)
|| ((parent_host_up == FALSE && parent_host->current_state != HOST_UP))) {
if ((parent_host_up == TRUE && parent_host->current_state != HOST_UP)
|| ((parent_host_up == FALSE && parent_host->current_state == HOST_UP))) {

log_debug_info(DEBUGL_CHECKS, 1, "Check of parent host '%s' queued.\n", parent_host->name);
schedule_host_check(parent_host, current_time, CHECK_OPTION_DEPENDENCY_CHECK);
Expand Down Expand Up @@ -1208,6 +1208,8 @@ int handle_async_service_check_result(service *svc, check_result *cr)

host * hst = NULL;

int notification_type = NOTIFICATION_NORMAL;

log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_service_check_result()\n");

if (svc == NULL) {
Expand Down Expand Up @@ -1590,10 +1592,16 @@ int handle_async_service_check_result(service *svc, check_result *cr)
handle_event = TRUE;
}

// Recovery notification was not sent
if(svc->notified_on != 0 && svc->current_state == STATE_OK && svc->state_type == HARD_STATE) {
notification_type = NOTIFICATION_RECOVERY;
send_notification = TRUE;
}

if (send_notification == TRUE) {

/* send notification */
if (service_notification(svc, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE) == OK) {
if (service_notification(svc, notification_type, NULL, NULL, NOTIFICATION_OPTION_NONE) == OK) {

/* log state due to notification event when stalking_options N is set */
if (should_stalk_notifications(svc)) {
Expand All @@ -1602,13 +1610,6 @@ int handle_async_service_check_result(service *svc, check_result *cr)
}
}

/* the service recovered, so reset the current notification number and state flags (after the recovery notification has gone out) */
/* We don't want to reset notifications if the service is currently flapping because we want recovery notifications */
if(svc->current_state == STATE_OK && svc->state_type == HARD_STATE && hard_state_change == TRUE && svc->is_flapping == FALSE) {
svc->current_notification_number = 0;
svc->notified_on = 0;
}

if (obsess_over_services == TRUE) {
obsessive_compulsive_service_check_processor(svc);
}
Expand All @@ -1634,7 +1635,6 @@ int handle_async_service_check_result(service *svc, check_result *cr)

/* Reset attempts */
if (hard_state_change == TRUE) {
svc->current_notification_number = 0;
svc->host_problem_at_last_check = FALSE;
}

Expand Down Expand Up @@ -2255,6 +2255,8 @@ int handle_async_host_check_result(host *hst, check_result *cr)

char * old_plugin_output = NULL;

int notification_type = NOTIFICATION_NORMAL;

log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_host_check_result()\n");

if (is_valid_check_result_data(hst, cr) == FALSE) {
Expand Down Expand Up @@ -2482,10 +2484,16 @@ int handle_async_host_check_result(host *hst, check_result *cr)
hst->current_attempt = 1;
}

// Recovery notification was not sent
if(hst->notified_on != 0 && hst->current_state == HOST_UP && hst->state_type == HARD_STATE) {
notification_type = NOTIFICATION_RECOVERY;
send_notification = TRUE;
}

if (send_notification == TRUE) {

/* send notifications */
if (host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE) == OK) {
if (host_notification(hst, notification_type, NULL, NULL, NOTIFICATION_OPTION_NONE) == OK) {

/* log state due to notification event when stalking_options N is set */
if (should_stalk_notifications(hst)) {
Expand All @@ -2494,13 +2502,6 @@ int handle_async_host_check_result(host *hst, check_result *cr)
}
}

/* the host recovered, so reset the current notification number and state flags (after the recovery notification has gone out) */
/* We don't want to reset notifications if the host is currently flapping because we want recovery notifications */
if(hst->current_state == HOST_UP && hst->state_type == HARD_STATE && hard_state_change == TRUE && hst->is_flapping == FALSE) {
hst->current_notification_number = 0;
hst->notified_on = 0;
}

if (obsess_over_hosts == TRUE) {
obsessive_compulsive_host_check_processor(hst);
}
Expand Down
10 changes: 2 additions & 8 deletions base/commands.c
Original file line number Diff line number Diff line change
Expand Up @@ -5390,10 +5390,7 @@ void clear_host_flapping_state(host *hst) {

/* should we send a recovery notification? */
if (hst->current_state == HOST_UP && hst->check_flapping_recovery_notification == TRUE) {
host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a host recovers in handle_async_host_check_result */
hst->current_notification_number = 0;
hst->notified_on = 0;
host_notification(hst, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}
}

Expand Down Expand Up @@ -5450,10 +5447,7 @@ void clear_service_flapping_state(service *svc) {

/* should we send a recovery notification? */
if (svc->current_state == STATE_OK && svc->check_flapping_recovery_notification == TRUE) {
service_notification(svc, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a service recovers in handle_async_service_check_result */
svc->current_notification_number = 0;
svc->notified_on = 0;
service_notification(svc, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}
}

Expand Down
20 changes: 4 additions & 16 deletions base/flapping.c
Original file line number Diff line number Diff line change
Expand Up @@ -375,10 +375,7 @@ void clear_service_flap(service *svc, double percent_change, double high_thresho

/* should we send a recovery notification? */
if(svc->check_flapping_recovery_notification == TRUE && svc->current_state == STATE_OK) {
service_notification(svc, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a service recovers in handle_async_service_check_result */
svc->current_notification_number = 0;
svc->notified_on = 0;
service_notification(svc, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}
}

Expand Down Expand Up @@ -470,10 +467,7 @@ void clear_host_flap(host *hst, double percent_change, double high_threshold, do

/* should we send a recovery notification? */
if(hst->check_flapping_recovery_notification == TRUE && hst->current_state == HOST_UP) {
host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a host recovers in handle_async_host_check_result */
hst->current_notification_number = 0;
hst->notified_on = 0;
host_notification(hst, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}
}

Expand Down Expand Up @@ -663,10 +657,7 @@ void handle_host_flap_detection_disabled(host *hst) {

/* should we send a recovery notification? */
if(hst->check_flapping_recovery_notification == TRUE && hst->current_state == HOST_UP) {
host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a host recovers in handle_async_host_check_result */
hst->current_notification_number = 0;
hst->notified_on = 0;
host_notification(hst, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}

/* clear the recovery notification flag */
Expand Down Expand Up @@ -781,10 +772,7 @@ void handle_service_flap_detection_disabled(service *svc) {

/* should we send a recovery notification? */
if(svc->check_flapping_recovery_notification == TRUE && svc->current_state == STATE_OK) {
service_notification(svc, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
/* Similar to what happens when a service recovers in handle_async_service_check_result */
svc->current_notification_number = 0;
svc->notified_on = 0;
service_notification(svc, NOTIFICATION_RECOVERY, NULL, NULL, NOTIFICATION_OPTION_NONE);
}

/* clear the recovery notification flag */
Expand Down
88 changes: 65 additions & 23 deletions base/notifications.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,19 @@ int service_notification(service *svc, int type, char *not_author, char *not_dat
/* check the viability of sending out a service notification */
if(check_service_notification_viability(svc, type, options) == ERROR) {
log_debug_info(DEBUGL_NOTIFICATIONS, 0, "Notification viability test failed. No notification will be sent out.\n");
// Recovery notification wasn't sent because they don't want it. Still have to modify values
if(type == NOTIFICATION_RECOVERY && ! flag_isset(svc->notification_options, OPT_RECOVERY)) {
svc->notified_on = 0;
svc->current_notification_number = 0;
}
/* Set next_notification time if we're in a downtime and
notification_interval = zero, otherwise if the service
doesn't recover after downtime ends, it will never send
the notification */
if (svc->notification_interval == 0.0) {
if (temp_host->scheduled_downtime_depth > 0 || svc->scheduled_downtime_depth > 0)
svc->next_notification = current_time;
}
if (svc->notification_interval == 0.0) {
if (temp_host->scheduled_downtime_depth > 0 || svc->scheduled_downtime_depth > 0)
svc->next_notification = current_time;
}
return ERROR;
}

Expand Down Expand Up @@ -212,7 +217,7 @@ int service_notification(service *svc, int type, char *not_author, char *not_dat
mac.x[MACRO_NOTIFICATIONTYPE] = "DOWNTIMECANCELLED";
else if(type == NOTIFICATION_CUSTOM)
mac.x[MACRO_NOTIFICATIONTYPE] = "CUSTOM";
else if(svc->current_state == STATE_OK)
else if(type == NOTIFICATION_RECOVERY)
mac.x[MACRO_NOTIFICATIONTYPE] = "RECOVERY";
else
mac.x[MACRO_NOTIFICATIONTYPE] = "PROBLEM";
Expand Down Expand Up @@ -276,7 +281,7 @@ int service_notification(service *svc, int type, char *not_author, char *not_dat
clear_servicegroup_macros_r(&mac);
clear_datetime_macros_r(&mac);

if(type == NOTIFICATION_NORMAL) {
if(type == NOTIFICATION_NORMAL || type == NOTIFICATION_RECOVERY) {

/* adjust last/next notification time and notification flags if we notified someone */
if(contacts_notified > 0) {
Expand All @@ -289,8 +294,14 @@ int service_notification(service *svc, int type, char *not_author, char *not_dat
/* update the last notification time for this service (this is needed for rescheduling later notifications) */
svc->last_notification = current_time;

/* update notifications flags */
add_notified_on(svc, svc->current_state);
/* update notifications flags, make sure to consider recovery */
if (type == NOTIFICATION_NORMAL) {
add_notified_on(svc, svc->current_state);
}
else {
svc->notified_on = 0;
svc->current_notification_number = 0;
}
}

/* we didn't end up notifying anyone */
Expand Down Expand Up @@ -603,6 +614,20 @@ int check_service_notification_viability(service *svc, int type, int options) {
return ERROR;
}

/* If any of the parents are down, don't notify */
if (temp_host->parent_hosts != NULL) {
hostsmember *temp_hostsmember = NULL;
host *parent_host = NULL;

for(temp_hostsmember = temp_host->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
parent_host = temp_hostsmember->host_ptr;
if (parent_host->current_state != HOST_UP) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "At least one parent (%s) is down, so we won't notify about this service.\n", parent_host->name);
return ERROR;
}
}
}

/* don't notify if we haven't waited long enough since the last time (and the service is not marked as being volatile) */
if((current_time < svc->next_notification) && svc->is_volatile == FALSE) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "We haven't waited long enough to re-notify contacts about this service.\n");
Expand Down Expand Up @@ -1065,6 +1090,11 @@ int host_notification(host *hst, int type, char *not_author, char *not_data, int

/* check viability of sending out a host notification */
if(check_host_notification_viability(hst, type, options) == ERROR) {
// Recovery notification wasn't sent because they don't want it. Still have to modify values
if(type == NOTIFICATION_RECOVERY && ! flag_isset(hst->notification_options, OPT_RECOVERY)) {
hst->notified_on = 0;
hst->current_notification_number = 0;
}
log_debug_info(DEBUGL_NOTIFICATIONS, 0, "Notification viability test failed. No notification will be sent out.\n");
return ERROR;
}
Expand Down Expand Up @@ -1169,7 +1199,7 @@ int host_notification(host *hst, int type, char *not_author, char *not_data, int
mac.x[MACRO_NOTIFICATIONTYPE] = "DOWNTIMECANCELLED";
else if(type == NOTIFICATION_CUSTOM)
mac.x[MACRO_NOTIFICATIONTYPE] = "CUSTOM";
else if(hst->current_state == HOST_UP)
else if(type == NOTIFICATION_RECOVERY)
mac.x[MACRO_NOTIFICATIONTYPE] = "RECOVERY";
else
mac.x[MACRO_NOTIFICATIONTYPE] = "PROBLEM";
Expand Down Expand Up @@ -1233,7 +1263,7 @@ int host_notification(host *hst, int type, char *not_author, char *not_data, int
clear_hostgroup_macros_r(&mac);
clear_datetime_macros_r(&mac);

if(type == NOTIFICATION_NORMAL) {
if(type == NOTIFICATION_NORMAL || type == NOTIFICATION_RECOVERY) {

/* adjust last/next notification time and notification flags if we notified someone */
if(contacts_notified > 0) {
Expand All @@ -1245,7 +1275,13 @@ int host_notification(host *hst, int type, char *not_author, char *not_data, int
hst->last_notification = current_time;

/* update notifications flags */
add_notified_on(hst, hst->current_state);
if(type == NOTIFICATION_NORMAL) {
add_notified_on(hst, hst->current_state);
}
else {
hst->notified_on = 0;
hst->current_notification_number = 0;
}

log_debug_info(DEBUGL_NOTIFICATIONS, 0, "%d contacts were notified. Next possible notification time: %s", contacts_notified, ctime(&hst->next_notification));
}
Expand Down Expand Up @@ -1449,21 +1485,13 @@ int check_host_notification_viability(host *hst, int type, int options) {
}

/* see if we should notify about problems with this host */
if((hst->notification_options & (1 << hst->current_state)) == FALSE) {
if(should_notify(hst) == FALSE) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "We shouldn't notify about %s status for this host.\n", host_state_name(hst->current_state));
return ERROR;
}
if(hst->current_state == HOST_UP) {

if((hst->notification_options & OPT_RECOVERY) == FALSE) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "We shouldn't notify about RECOVERY states for this host.\n");
return ERROR;
}
if(!hst->notified_on) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "We shouldn't notify about this recovery.\n");
return ERROR;
}

if(hst->current_state == HOST_UP && hst->notified_on == 0) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "We shouldn't notify about this recovery.\n");
return ERROR;
}

/* see if enough time has elapsed for first notification (Mathias Sundman) */
Expand Down Expand Up @@ -1510,6 +1538,20 @@ int check_host_notification_viability(host *hst, int type, int options) {
return ERROR;
}

/* If any of the parents are down, don't notify */
if (hst->parent_hosts != NULL) {
hostsmember *temp_hostsmember = NULL;
host *parent_host = NULL;

for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
parent_host = temp_hostsmember->host_ptr;
if (parent_host->current_state != HOST_UP) {
log_debug_info(DEBUGL_NOTIFICATIONS, 1, "At least one parent (%s) is down, so we won't notify about this host.\n", parent_host->name);
return ERROR;
}
}
}

return OK;
}

Expand Down
1 change: 1 addition & 0 deletions include/nagios.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ extern struct load_control loadctl;
#define NOTIFICATION_DOWNTIMEEND 6
#define NOTIFICATION_DOWNTIMECANCELLED 7
#define NOTIFICATION_CUSTOM 8
#define NOTIFICATION_RECOVERY 9



Expand Down
1 change: 1 addition & 0 deletions t-tap/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ test_xsddefault
test_commands
test_downtime
test_strtoul
test_notifications
*.dSYM
4 changes: 4 additions & 0 deletions t-tap/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ TESTS += test_downtime
TESTS += test_nagios_config
TESTS += test_timeperiods
TESTS += test_macros
TESTS += test_notifications

XSD_OBJS = $(BLD_CGI)/statusdata-cgi.o $(BLD_CGI)/xstatusdata-cgi.o
XSD_OBJS += $(BLD_CGI)/objects-cgi.o $(BLD_CGI)/xobjects-cgi.o
Expand Down Expand Up @@ -104,6 +105,9 @@ test_timeperiods: test_timeperiods.o $(TP_OBJS) $(TAPOBJ)
test_macros: test_macros.o $(TP_OBJS) $(TAPOBJ)
$(CC) $(CFLAGS) -o $@ $^ $(BLD_BASE)/checks.o $(BROKER_LDFLAGS) $(LDFLAGS) $(MATHLIBS) $(SOCKETLIBS) $(LIBS)

test_notifications: test_notifications.o $(BLD_BASE)/notifications.o $(BLD_BASE)/checks.o $(BLD_BASE)/utils.o $(BLD_COMMON)/shared.o $(BLD_BASE)/objects-base.o $(TAPOBJ)
$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(MATHLIBS)

test_xsddefault: test_xsddefault.o $(XSD_OBJS) $(TAPOBJ)
$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

Expand Down
9 changes: 9 additions & 0 deletions t-tap/stub_broker.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ void broker_timed_event(int type, int flags, int attr, timed_event *event, struc

void broker_flapping_data(int type, int flags, int attr, int flapping_type, void *data, double percent_change, double high_threshold, double low_threshold, struct timeval *timestamp)
{ }

int broker_contact_notification_data(int type, int flags, int attr, int notification_type, int reason_type, struct timeval start_time, struct timeval end_time, void *data, contact *cntct, char *ack_author, char *ack_data, int escalated, struct timeval *timestamp)
{ return OK; }

int broker_contact_notification_method_data(int type, int flags, int attr, int notification_type, int reason_type, struct timeval start_time, struct timeval end_time, void *data, contact *cntct, char *cmd, char *ack_author, char *ack_data, int escalated, struct timeval *timestamp)
{ return OK; }

int broker_notification_data(int type, int flags, int attr, int notification_type, int reason_type, struct timeval start_time, struct timeval end_time, void *data, char *ack_author, char *ack_data, int escalated, int contacts_notified, struct timeval *timestamp)
{ return OK; }
Loading

0 comments on commit c378fd1

Please sign in to comment.