Checkpoint

lbwexler committed Apr 24, 2024
1 parent 50bc505 commit efaae39
Showing 12 changed files with 318 additions and 362 deletions.
@@ -24,7 +24,7 @@ class MonitorResultsAdminController extends BaseClusterController {

@Access(['HOIST_ADMIN'])
def forceRunAllMonitors() {
runOnInstance(new ForceRunAllMonitors(), Utils.clusterService.localName)
runOnPrimary(new ForceRunAllMonitors())
}
static class ForceRunAllMonitors extends ClusterRequest {
def doCall() {
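
The hunk above swaps an explicit runOnInstance call targeting the local instance for a new runOnPrimary convenience. That helper is not shown in this diff; a minimal sketch of what it could look like on the base controller, assuming the cluster service exposes the primary instance's name (primaryName is an assumption here, mirroring the localName property used before):

    // Hypothetical sketch only -- runOnPrimary does not appear in this diff.
    // Assumes Utils.clusterService exposes a primaryName property.
    def runOnPrimary(ClusterRequest task) {
        runOnInstance(task, Utils.clusterService.primaryName)
    }
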
@@ -5,10 +5,11 @@
* Copyright © 2023 Extremely Heavy Industries Inc.
*/

package io.xh.hoist.monitor
package io.xh.hoist.admin

import io.xh.hoist.BaseService
import io.xh.hoist.exception.DataNotAvailableException
import io.xh.hoist.util.DateTimeUtils
import org.apache.tomcat.jdbc.pool.DataSource as PooledDataSource
import org.apache.tomcat.jdbc.pool.PoolConfiguration
import org.springframework.boot.jdbc.DataSourceUnwrapper
@@ -34,7 +35,7 @@ class ConnectionPoolMonitoringService extends BaseService {

void init() {
createTimer(
interval: {enabled ? config.snapshotInterval * SECONDS: -1},
interval: {enabled ? config.snapshotInterval * DateTimeUtils.SECONDS: -1},
runFn: this.&takeSnapshot
)
}
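
Here and in MemoryMonitoringService below, the timer interval now qualifies the time constant as DateTimeUtils.SECONDS rather than relying on a static import. These constants are millisecond multipliers, so the closure yields the snapshot interval in milliseconds while the service is enabled, and -1 (pausing the timer) otherwise. A small illustration, assuming a snapshotInterval of 60 seconds (the value is invented for the example):

    // Illustration only -- assumes config.snapshotInterval == 60 and the service is enabled.
    import io.xh.hoist.util.DateTimeUtils

    long intervalMs = 60 * DateTimeUtils.SECONDS   // 60_000 ms between snapshots
    long pausedFlag = -1                           // value returned by the closure when disabled
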
@@ -5,10 +5,12 @@
* Copyright © 2023 Extremely Heavy Industries Inc.
*/

package io.xh.hoist.monitor
package io.xh.hoist.admin

import com.sun.management.HotSpotDiagnosticMXBean
import io.xh.hoist.BaseService
import io.xh.hoist.util.DateTimeUtils
import io.xh.hoist.util.Utils

import java.lang.management.GarbageCollectorMXBean
import java.lang.management.ManagementFactory
@@ -34,7 +36,7 @@ class MemoryMonitoringService extends BaseService {

void init() {
createTimer(
interval: {this.enabled ? config.snapshotInterval * SECONDS: -1},
interval: {this.enabled ? config.snapshotInterval * DateTimeUtils.SECONDS: -1},
runFn: this.&takeSnapshot
)
}
@@ -91,7 +93,7 @@ class MemoryMonitoringService extends BaseService {
if (newSnap.usedPctMax > 90) {
logWarn(newSnap)
logWarn("MEMORY USAGE ABOVE 90%")
} else if (intervalElapsed(1 * HOURS, _lastInfoLogged)) {
} else if (intervalElapsed(1 * DateTimeUtils.HOURS, _lastInfoLogged)) {
logInfo(newSnap)
_lastInfoLogged = new Date()
} else {
@@ -152,7 +154,7 @@ class MemoryMonitoringService extends BaseService {

long collectionCount = totalCollectionCount - (last ? last.totalCollectionCount : 0),
collectionTime = totalCollectionTime - (last ? last.totalCollectionTime : 0),
elapsedTime = timestamp - (last ? last.timestamp : startupTime.toInstant().toEpochMilli())
elapsedTime = timestamp - (last ? last.timestamp : Utils.startupTime.toInstant().toEpochMilli())

def avgCollectionTime = collectionCount ? Math.round(collectionTime/collectionCount) : 0

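
The block above reports garbage-collection activity as deltas against the prior snapshot (or against app startup when no prior snapshot exists). A worked example of the arithmetic, with invented numbers:

    // Invented numbers, purely to illustrate the delta calculation above.
    // previous snapshot: totalCollectionCount = 120, totalCollectionTime = 2400 ms
    // current snapshot:  totalCollectionCount = 126, totalCollectionTime = 2580 ms
    def collectionCount = 126 - 120,                                       // 6 collections since last snapshot
        collectionTime = 2580 - 2400,                                      // 180 ms spent collecting since then
        avgCollectionTime = Math.round(collectionTime / collectionCount)   // 30 ms per collection
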
81 changes: 47 additions & 34 deletions grails-app/services/io/xh/hoist/monitor/MonitorResultService.groovy
@@ -7,9 +7,9 @@

package io.xh.hoist.monitor

import grails.async.Promises
import grails.gorm.transactions.ReadOnly
import io.xh.hoist.BaseService
import io.xh.hoist.util.Utils
import static grails.async.Promises.task

import java.util.concurrent.TimeoutException
@@ -18,26 +18,35 @@ import static io.xh.hoist.monitor.MonitorStatus.*
import static java.util.concurrent.TimeUnit.SECONDS


/**
* Runs individual status monitor checks as directed by MonitorService and as configured by
* data-driven status monitor definitions. Timeouts and any other exceptions will be caught and
* returned cleanly as failures.
*/
class MonitorResultService extends BaseService {

def configService,
monitorDefinitionService

/**
* Runs all enabled and active monitors on this instance in parallel.
*/
@ReadOnly
MonitorResult runMonitor(String code, long timeoutSeconds) {
def monitor = Monitor.findByCode(code)
if (!monitor) throw new RuntimeException("Monitor '$code' not found.")
return runMonitor(monitor, timeoutSeconds)
List<MonitorResult> runAllMonitors() {
def timeout = getTimeoutSeconds(),
monitors = Monitor.list().findAll{it.active && (isPrimary || !it.primaryOnly)}

withDebug("Running ${monitors.size()} monitors") {
def tasks = monitors.collect { m -> task {runMonitor(m, timeout)}},
ret = Promises.waitAll(tasks)

if (monitorConfig.writeToMonitorLog != false) logResults(ret)

return ret
} as List<MonitorResult>
}

/**
     * Runs an individual monitor on this instance. Timeouts and any other exceptions will be
* caught and returned cleanly as failures.
*/
MonitorResult runMonitor(Monitor monitor, long timeoutSeconds) {
if (!monitor.active || (monitor.primaryOnly && !clusterService.isPrimary)) {
return inactiveMonitorResult(monitor)
}

def defSvc = Utils.appContext.monitorDefinitionService,
def defSvc = monitorDefinitionService,
code = monitor.code,
result = new MonitorResult(monitor: monitor, instance: clusterService.localName, primary: isPrimary),
startTime = new Date()
@@ -79,25 +88,6 @@ class MonitorResultService extends BaseService {
return result
}

MonitorResult unknownMonitorResult(Monitor monitor) {
return new MonitorResult(
status: UNKNOWN,
date: new Date(),
elapsed: 0,
monitor: monitor
)
}

MonitorResult inactiveMonitorResult(Monitor monitor) {
return new MonitorResult(
status: INACTIVE,
date: new Date(),
elapsed: 0,
monitor: monitor
)
}


//------------------------
// Implementation
//------------------------
@@ -129,4 +119,27 @@
result.prependMessage("Metric value is $verb warn limit of $warn $units")
}
}

//---------------------
// Implementation
//--------------------
private long getTimeoutSeconds() {
(monitorConfig.monitorTimeoutSecs ?: 15) as long
}

private Map getMonitorConfig() {
configService.getMap('xhMonitorConfig')
}

private void logResults(Collection<MonitorResult> results) {
results.each {
logInfo([code: it.code, status: it.status, metric: it.metric])
}

def failsCount = results.count {it.status == FAIL},
warnsCount = results.count {it.status == WARN},
okCount = results.count {it.status == OK}

logInfo([fails: failsCount, warns: warnsCount, okays: okCount])
}
}
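
The private helpers above read their settings from the xhMonitorConfig soft-config map, which also drives the notification thresholds used by MonitoringReportService below. A sketch of such a map, limited to the keys referenced in this diff (the values are illustrative, not defaults confirmed by the source):

    // Illustrative values only; the keys are the ones read in this diff.
    [
        monitorTimeoutSecs     : 15,    // per-monitor timeout, defaulted to 15 above
        writeToMonitorLog      : true,  // logResults() is skipped only when explicitly false
        failNotifyThreshold    : 2,     // cyclesAsFail required before alerting
        warnNotifyThreshold    : 3,     // cyclesAsWarn required before alerting
        monitorRepeatNotifyMins: 60     // re-notify interval (minutes) while still alerting
    ]
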

This file was deleted.

104 changes: 104 additions & 0 deletions grails-app/services/io/xh/hoist/monitor/MonitoringReportService.groovy
@@ -0,0 +1,104 @@
/*
* This file belongs to Hoist, an application development toolkit
* developed by Extremely Heavy Industries (www.xh.io | [email protected])
*
* Copyright © 2023 Extremely Heavy Industries Inc.
*/
package io.xh.hoist.monitor

import io.xh.hoist.BaseService
import io.xh.hoist.util.Utils

import static grails.util.Environment.isDevelopmentMode
import static io.xh.hoist.monitor.MonitorStatus.WARN
import static io.xh.hoist.util.DateTimeUtils.MINUTES
import static io.xh.hoist.util.DateTimeUtils.intervalElapsed
import static java.lang.System.currentTimeMillis

/**
* Listens for status monitor change events from MonitoringService and generates a report.
 * Reports are generated periodically, and also when the alerting state changes after the
 * configured fail/warn thresholds are crossed.
*
* Also emails status updates to a configurable list of recipients.
*/
class MonitoringReportService extends BaseService {

def emailService,
configService

// Notification state for primary instance to manage
// If primary instance goes down, may get extra notification -- that's ok
private Long lastNotified = null
private boolean alertMode = false

void noteResultsUpdated(Collection<MonitorResults> results) {
if (!isPrimary) return;

def failThreshold = config.failNotifyThreshold,
warnThreshold = config.warnNotifyThreshold

// 1) Calc new alert mode, true if crossed thresholds or already alerting and still have problems
boolean newAlertMode = (alertMode && results?.any {it.status >= WARN}) ||
results?.any { it.cyclesAsFail >= failThreshold || it.cyclesAsWarn >= warnThreshold }

// 2) Generate report if we have a change, or still alerting and interval has elapsed
if (newAlertMode != alertMode ||
(newAlertMode && intervalElapsed(config.monitorRepeatNotifyMins * MINUTES, lastNotified))
) {
lastNotified = currentTimeMillis()
alertMode = newAlertMode
generateStatusReport(results)
}
}

//------------------------
// Implementation
//------------------------
private MonitorStatusReport generateStatusReport(results) {
def report = new MonitorStatusReport(results: results)
logDebug("Emitting monitor status report: ${report.title}")
getTopic('xhMonitorStatusReport').publishAsync(report)
if (isDevelopmentMode()) {
emailReport(report)
}
}

private void emailReport(MonitorStatusReport report) {
def to = emailService.parseMailConfig('xhMonitorEmailRecipients')
if (to) {
emailService.sendEmail(
to: to,
subject: report.title,
html: formatHtml(report),
async: true
)
}
}

private String formatHtml(MonitorStatusReport report) {
def results = report.results

results.sort{it.name}
results.sort{it.status}

if (report.status < WARN) return "There are no alerting monitors for ${Utils.appName}."

return results.findAll{it.status >= WARN}.collect {
"+ $it.name | ${it.message ? it.message + ' | ' : ''}Minutes in [${it.status}]: ${it.minsInStatus}"
}.join('<br>')
}

private Map getConfig() {
configService.getMap('xhMonitorConfig')
}

Map getAdminStats() {[
config: [
toAddress: emailService.parseMailConfig('xhMonitorEmailRecipients'),
*: configService.getForAdminStats('xhMonitorConfig')
],
        lastNotified: lastNotified,
alertMode: alertMode
]}

}
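
generateStatusReport publishes each report to the xhMonitorStatusReport topic before (in development mode only, per the check above) emailing it. If getTopic returns a Hazelcast ITopic, as the publishAsync call suggests, a consumer elsewhere in the app could listen roughly as follows; this is a sketch, not an API confirmed by the diff:

    // Sketch of a subscriber -- assumes getTopic() returns a Hazelcast ITopic.
    getTopic('xhMonitorStatusReport').addMessageListener { msg ->
        MonitorStatusReport report = msg.messageObject
        logInfo("Received monitor status report: ${report.title}")
    }
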