Skip to content

Commit

Permalink
feat(MonitoringService): consolidate monitor result history into Stat…
Browse files Browse the repository at this point in the history
…usInfos
  • Loading branch information
jacob-xhio committed Apr 23, 2024
1 parent 36d6a6d commit c11eaca
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 135 deletions.
3 changes: 2 additions & 1 deletion grails-app/domain/io/xh/hoist/monitor/Monitor.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ class Monitor implements JSONFormat {
sortOrder: sortOrder,
active: active,
lastUpdatedBy: lastUpdatedBy,
lastUpdated: lastUpdated
lastUpdated: lastUpdated,
masterOnly: masterOnly
]
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class MonitorResultService extends BaseService {
}

MonitorResult runMonitor(Monitor monitor, long timeoutSeconds) {
if (!monitor.active) {
if (!monitor.active || monitor.masterOnly && !Utils.clusterService.isMaster) {
return inactiveMonitorResult(monitor)
}

Expand Down
142 changes: 52 additions & 90 deletions grails-app/services/io/xh/hoist/monitor/MonitoringService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@

package io.xh.hoist.monitor

import com.hazelcast.core.EntryListener
import com.hazelcast.map.IMap
import grails.async.Promises
import grails.gorm.transactions.ReadOnly
import io.xh.hoist.BaseService
import io.xh.hoist.cluster.ClusterRequest
import io.xh.hoist.cluster.ReplicatedValue
import io.xh.hoist.util.Timer
import io.xh.hoist.util.Utils

import static io.xh.hoist.util.DateTimeUtils.intervalElapsed
import static grails.async.Promises.task
import static io.xh.hoist.monitor.MonitorStatus.*
import static io.xh.hoist.util.DateTimeUtils.MINUTES
import static io.xh.hoist.util.DateTimeUtils.SECONDS
import static grails.util.Environment.isDevelopmentMode
import static java.lang.System.currentTimeMillis
import static io.xh.hoist.util.Utils.getAppContext


/**
Expand All @@ -45,7 +45,6 @@ class MonitoringService extends BaseService {
private ReplicatedValue<Map<String, StatusInfo>> _statusInfos = getReplicatedValue('statusInfos')

// Notification state for master to read only
private ReplicatedValue<Map<String, Map>> problems = getReplicatedValue('problems')
private ReplicatedValue<Boolean> alertMode = getReplicatedValue('alertMode')
private ReplicatedValue<Long> lastNotified = getReplicatedValue('lastNotified')

Expand All @@ -55,16 +54,17 @@ class MonitoringService extends BaseService {

void init() {
monitorTimer = createTimer(
name: 'monitorTimer',
runFn: this.&onMonitorTimer,
interval: {monitorInterval},
delay: startupDelay
name: 'monitorTimer',
runFn: this.&onMonitorTimer,
interval: {monitorInterval},
delay: startupDelay
)

notifyTimer = createTimer (
name: 'notifyTimer',
runFn: this.&onNotifyTimer,
interval: {notifyInterval}
name: 'notifyTimer',
runFn: this.&onNotifyTimer,
interval: {notifyInterval},
masterOnly: true
)

cleanupTimer = createTimer(
Expand All @@ -73,9 +73,16 @@ class MonitoringService extends BaseService {
interval: {cleanupInterval},
masterOnly: true,
)

_results.addEntryListener([
entryAdded: { updateStatuses() },
entryUpdated: { updateStatuses() },
entryRemoved: { updateStatuses() }
] as EntryListener, false)
}

void forceRun() {
cleanupTimer.forceRun()
monitorTimer.forceRun()
}

Expand All @@ -90,8 +97,7 @@ class MonitoringService extends BaseService {

new MonitorInfo(
monitor: it,
status: statusInfo.status,
lastStatusChange: statusInfo.lastChange,
statusInfo: statusInfo,
instanceResults: instanceResults
)
}
Expand All @@ -109,74 +115,47 @@ class MonitoringService extends BaseService {
task { monitorResultService.runMonitor(m, timeout) }
}

def localName = Utils.clusterService.localName
def clusterService = Utils.clusterService,
localName = clusterService.localName

Map<String, MonitorResult> newResults = Promises
.waitAll(tasks)
.collectEntries {
it.instance = localName
it.instance = clusterService.isMaster ? localName + '-M' : localName
[it.code, it]
}

_results[localName] = newResults
Utils.clusterService.submitToInstance(new StatusUpdater(), Utils.clusterService.masterName)
if (monitorConfig.writeToMonitorLog != false) logResults()
evaluateProblems()
if (monitorConfig.writeToMonitorLog != false) logResults(newResults.values())
}
}

@ReadOnly
private void updateStatuses() {
if (!isMaster) return
def statusInfos = _statusInfos.get() ?: [:]
Monitor.list().each{ monitor ->
def code = monitor.code
List<MonitorStatus> statuses = _results.findAll{it.value[code]}.collect{it.value[code].status}
MonitorStatus newStatus = statuses.max()
if (!statusInfos[code] || newStatus > statusInfos[code].status) {
statusInfos[code] = new StatusInfo(status: newStatus, lastChange: new Date())
_statusInfos.set(statusInfos)
}
}
}

static class StatusUpdater extends ClusterRequest {
def doCall() {
appContext.monitoringService.updateStatuses()
def statusInfo = statusInfos[code] ?: new StatusInfo()
statusInfo.recordStatus(statuses.max())
statusInfos[code] = statusInfo
}
_statusInfos.set(statusInfos)
evaluateProblems()
}

private void evaluateProblems() {
Map<String, MonitorResult> flaggedResults = results
.findAll { it.status >= WARN }
.collectEntries{
def globalStatus = it.status
[it.code, it.instanceResults.find { it.status == globalStatus }]
}

// 0) Remove all problems that are no longer problems
def probs = problems.get()?.findAll {flaggedResults[it.key]} ?: [:]

// 1) (Re)Mark all existing problems
flaggedResults.each { code, result ->
def problem = probs[code]
if (!problem) {
problem = probs[code] = [result: result, cyclesAsFail: 0, cyclesAsWarn: 0]
}

if (result.status == FAIL) {
problem.cyclesAsFail++
} else {
problem.cyclesAsFail = 0
problem.cyclesAsWarn++
}
}
problems.set(probs)
def statusInfos = _statusInfos.get()?.values() ?: [] as Collection<StatusInfo>,
failThreshold = monitorConfig.failNotifyThreshold,
warnThreshold = monitorConfig.warnNotifyThreshold

// 2) Handle alert mode transition -- notify immediately
// Note that we may get an extra transition if new master introduced in alerting
def currAlertMode = calcAlertMode()
if (currAlertMode != alertMode.get()) {
alertMode.set(currAlertMode)
// Calc new alert mode, true if crossed thresholds or already alerting and still have problems
def currAlertMode = alertMode.get()
def newAlertMode = (currAlertMode && statusInfos.any { it.status >= WARN }) ||
statusInfos.any { it.cyclesAsFail >= failThreshold || it.cyclesAsWarn >= warnThreshold }
if (newAlertMode != alertMode.get()) {
alertMode.set(newAlertMode)
notifyAlertModeChange()
}
}
Expand All @@ -188,27 +167,13 @@ class MonitoringService extends BaseService {
}
}

private boolean calcAlertMode() {
if (alertMode.get() && problems.get()) return true

def failThreshold = monitorConfig.failNotifyThreshold,
warnThreshold = monitorConfig.warnNotifyThreshold

return problems.get().values().any {
it.cyclesAsFail >= failThreshold || it.cyclesAsWarn >= warnThreshold
}
}

private MonitorStatusReport generateStatusReport() {
def results = results.collectMany{it.instanceResults}
new MonitorStatusReport(results: results)
new MonitorStatusReport(infos: results)
}

private void logResults() {
results.each { it ->
it.instanceResults.each { res ->
logInfo([instance: res.instance, code: it.code, status: res.status, metric: res.metric])
}
private void logResults(Collection<MonitorResult> results) {
results.each {
logInfo([code: it.code, status: it.status, metric: it.metric])
}

def failsCount = results.count {it.status == FAIL},
Expand All @@ -219,11 +184,9 @@ class MonitoringService extends BaseService {
}

private void onNotifyTimer() {
if (!alertMode.get() || !lastNotified.get()) return
def now = currentTimeMillis(),
timeThresholdMet = now > lastNotified.get() + monitorConfig.monitorRepeatNotifyMins * MINUTES
if (!alertMode.get()) return

if (timeThresholdMet) {
if (intervalElapsed(monitorConfig.monitorRepeatNotifyMins * MINUTES, lastNotified.get())) {
def report = generateStatusReport()
logDebug("Emitting monitor status report: ${report.title}")
getTopic('xhMonitorStatusReport').publishAsync(report)
Expand Down Expand Up @@ -260,26 +223,25 @@ class MonitoringService extends BaseService {
configService.getMap('xhMonitorConfig')
}

private void cleanup() {
for (String instance: _results.keySet()) {
if (!clusterService.isMember(instance)) {
_results.remove(instance)
}
}
}

void clearCaches() {
super.clearCaches()
if (isMaster) {
_results.clear()
problems.set(null)
_statusInfos.set(null)
if (monitorInterval > 0) {
monitorTimer.forceRun()
}
}
}

void cleanup() {
def clusterService = Utils.clusterService
for (String instance: _results.keySet()) {
if (!clusterService.isMember(instance)) {
_results.remove(instance)
}
}
}

Map getAdminStats() {[
config: configForAdminStats('xhMonitoringEnabled', 'xhMonitorConfig'),
]}
Expand Down
8 changes: 3 additions & 5 deletions src/main/groovy/io/xh/hoist/monitor/MonitorInfo.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* This file belongs to Hoist, an application development toolkit
* developed by Extremely Heavy Industries (www.xh.io | [email protected])
*
* Copyright © 2023 Extremely Heavy Industries Inc.
* Copyright © 2024 Extremely Heavy Industries Inc.
*/

package io.xh.hoist.monitor
Expand All @@ -15,8 +15,7 @@ import static io.xh.hoist.monitor.MonitorStatus.UNKNOWN
@CompileStatic
class MonitorInfo implements JSONFormat {
Monitor monitor
MonitorStatus status = UNKNOWN
Date lastStatusChange
StatusInfo statusInfo
List<MonitorResult> instanceResults = []

String getCode() {
Expand All @@ -29,9 +28,8 @@ class MonitorInfo implements JSONFormat {
name: monitor.name,
sortOrder: monitor.sortOrder,
masterOnly: monitor.masterOnly,
status: status,
lastStatusChange: lastStatusChange,
metricUnit: monitor.metricUnit,
statusInfo: statusInfo,
instanceResults: instanceResults
]
}
Expand Down
6 changes: 6 additions & 0 deletions src/main/groovy/io/xh/hoist/monitor/MonitorResult.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ class MonitorResult implements JSONFormat {
monitor.params ? JSONParser.parseObject(monitor.params) : [:]
}

/** Combines the given string with 'message', separated by formatting */
void prependMessage(String prependStr) {
// Space character before the newlines is for fallback formatting in `hoist-react <= v51.0.0`
message = prependStr + (message ? " \n\n$message" : '')
}

Map formatForJSON() {
[
instance: instance,
Expand Down
12 changes: 6 additions & 6 deletions src/main/groovy/io/xh/hoist/monitor/MonitorStatusReport.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ import io.xh.hoist.util.Utils
import static io.xh.hoist.monitor.MonitorStatus.*

class MonitorStatusReport {
List<MonitorResult> results
List<MonitorInfo> infos

MonitorStatus getStatus() {
if (!results) return MonitorStatus.OK
results.max{it.status}.status
if (!infos) return MonitorStatus.OK
infos.max{it.status}.status
}

String getTitle() {
def failsCount = results.count{it.status == FAIL},
warnsCount = results.count{it.status == WARN},
okCount = results.count{it.status == OK},
def failsCount = infos.count{it.status == FAIL},
warnsCount = infos.count{it.status == WARN},
okCount = infos.count{it.status == OK},
title = "${Utils.appName}: ",
msgParts = []

Expand Down
41 changes: 40 additions & 1 deletion src/main/groovy/io/xh/hoist/monitor/StatusInfo.groovy
Original file line number Diff line number Diff line change
@@ -1,6 +1,45 @@
/*
* This file belongs to Hoist, an application development toolkit
* developed by Extremely Heavy Industries (www.xh.io | [email protected])
*
* Copyright © 2024 Extremely Heavy Industries Inc.
*/

package io.xh.hoist.monitor

import io.xh.hoist.util.Utils

import static io.xh.hoist.monitor.MonitorStatus.*


class StatusInfo {
MonitorStatus status
MonitorStatus status = UNKNOWN
Date lastChange
Integer cyclesAsSuccess
Integer cyclesAsFail
Integer cyclesAsWarn

void recordStatus(MonitorStatus status) {
// Keep track of the number of consecutive cycles in each status
switch (status) {
case FAIL:
cyclesAsSuccess = 0
cyclesAsFail++
break
case WARN:
cyclesAsSuccess = 0
cyclesAsFail = 0
cyclesAsWarn++
break
case OK:
cyclesAsFail = 0
cyclesAsWarn = 0
cyclesAsSuccess++
break
}
if (status != this.status) {
this.status = status
lastChange = new Date()
}
}
}
Loading

0 comments on commit c11eaca

Please sign in to comment.