Skip to content

Commit

Permalink
featg(MonitorResult): track master status of monitor result
Browse files Browse the repository at this point in the history
  • Loading branch information
jacob-xhio committed Apr 23, 2024
1 parent c11eaca commit b4cc239
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ class MonitorResultService extends BaseService {
}

MonitorResult runMonitor(Monitor monitor, long timeoutSeconds) {
if (!monitor.active || monitor.masterOnly && !Utils.clusterService.isMaster) {
if (!monitor.active || (monitor.masterOnly && !clusterService.isMaster)) {
return inactiveMonitorResult(monitor)
}

def defSvc = Utils.appContext.monitorDefinitionService,
code = monitor.code,
result = new MonitorResult(monitor: monitor),
result = new MonitorResult(monitor: monitor, instance: clusterService.localName, master: isMaster),
startTime = new Date()

try {
Expand Down
30 changes: 13 additions & 17 deletions grails-app/services/io/xh/hoist/monitor/MonitoringService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import grails.gorm.transactions.ReadOnly
import io.xh.hoist.BaseService
import io.xh.hoist.cluster.ReplicatedValue
import io.xh.hoist.util.Timer
import io.xh.hoist.util.Utils

import static io.xh.hoist.util.DateTimeUtils.intervalElapsed
import static grails.async.Promises.task
Expand All @@ -41,12 +40,13 @@ class MonitoringService extends BaseService {
monitorResultService

// Shared state for all servers to read
// Map of instance name -> Map of monitor code -> MonitorResult
private IMap<String, Map<String, MonitorResult>> _results = getIMap('results')
private ReplicatedValue<Map<String, StatusInfo>> _statusInfos = getReplicatedValue('statusInfos')

// Notification state for master to read only
private ReplicatedValue<Boolean> alertMode = getReplicatedValue('alertMode')
private ReplicatedValue<Long> lastNotified = getReplicatedValue('lastNotified')
// Notification state for master instance to manage
private ReplicatedValue<Map<String, StatusInfo>> _statusInfos = getReplicatedValue('statusInfos')
private ReplicatedValue<Boolean> _alertMode = getReplicatedValue('alertMode')
private ReplicatedValue<Long> _lastNotified = getReplicatedValue('lastNotified')

private Timer monitorTimer
private Timer notifyTimer
Expand Down Expand Up @@ -115,17 +115,13 @@ class MonitoringService extends BaseService {
task { monitorResultService.runMonitor(m, timeout) }
}

def clusterService = Utils.clusterService,
localName = clusterService.localName

Map<String, MonitorResult> newResults = Promises
.waitAll(tasks)
.collectEntries {
it.instance = clusterService.isMaster ? localName + '-M' : localName
[it.code, it]
}

_results[localName] = newResults
_results[clusterService.localName] = newResults
if (monitorConfig.writeToMonitorLog != false) logResults(newResults.values())
}
}
Expand All @@ -151,19 +147,19 @@ class MonitoringService extends BaseService {
warnThreshold = monitorConfig.warnNotifyThreshold

// Calc new alert mode, true if crossed thresholds or already alerting and still have problems
def currAlertMode = alertMode.get()
def currAlertMode = _alertMode.get()
def newAlertMode = (currAlertMode && statusInfos.any { it.status >= WARN }) ||
statusInfos.any { it.cyclesAsFail >= failThreshold || it.cyclesAsWarn >= warnThreshold }
if (newAlertMode != alertMode.get()) {
alertMode.set(newAlertMode)
if (newAlertMode != currAlertMode) {
_alertMode.set(newAlertMode)
notifyAlertModeChange()
}
}

private void notifyAlertModeChange() {
if (!isDevelopmentMode()) {
getTopic('xhMonitorStatusReport').publishAsync(generateStatusReport())
lastNotified.set(currentTimeMillis())
_lastNotified.set(currentTimeMillis())
}
}

Expand All @@ -184,13 +180,13 @@ class MonitoringService extends BaseService {
}

private void onNotifyTimer() {
if (!alertMode.get()) return
if (!_alertMode.get()) return

if (intervalElapsed(monitorConfig.monitorRepeatNotifyMins * MINUTES, lastNotified.get())) {
if (intervalElapsed(monitorConfig.monitorRepeatNotifyMins * MINUTES, _lastNotified.get())) {
def report = generateStatusReport()
logDebug("Emitting monitor status report: ${report.title}")
getTopic('xhMonitorStatusReport').publishAsync(report)
lastNotified.set(currentTimeMillis())
_lastNotified.set(currentTimeMillis())
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/main/groovy/io/xh/hoist/monitor/MonitorResult.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import static io.xh.hoist.monitor.MonitorStatus.UNKNOWN
@CompileStatic
class MonitorResult implements JSONFormat {
String instance
Boolean master
MonitorStatus status = UNKNOWN
Object metric
String message
Expand All @@ -41,6 +42,7 @@ class MonitorResult implements JSONFormat {
Map formatForJSON() {
[
instance: instance,
master: master,
status: status,
metric: metric,
message: message,
Expand Down
7 changes: 4 additions & 3 deletions src/main/groovy/io/xh/hoist/monitor/StatusInfo.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ import static io.xh.hoist.monitor.MonitorStatus.*
class StatusInfo {
MonitorStatus status = UNKNOWN
Date lastChange
Integer cyclesAsSuccess
Integer cyclesAsFail
Integer cyclesAsWarn
Integer cyclesAsSuccess = 0
Integer cyclesAsFail = 0
Integer cyclesAsWarn = 0

void recordStatus(MonitorStatus status) {
// Keep track of the number of consecutive cycles in each status
switch (status) {
case FAIL:
// Entering FAIL does not clear WARN streaks
cyclesAsSuccess = 0
cyclesAsFail++
break
Expand Down

0 comments on commit b4cc239

Please sign in to comment.