Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for replay_delay #84

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 87 additions & 13 deletions check_postgres.pl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ package check_postgres;
## Which user to connect as if --dbuser is not given
$opt{defaultuser} = 'postgres';

## Number of seconds a slave may go without replaying a transaction from the master
## while the replay_delay check still reports a caught-up slave as having zero delay.
## Used if --maxidlemasterdelay is not given
$opt{defaultidlemasterdelay} = 3600;

## Which port to connect to if --dbport is not given
$opt{defaultport} = 5432;

Expand Down Expand Up @@ -975,19 +978,20 @@ package check_postgres;
'tempdir=s',
'get_method=s',
'language=s',
'mrtg=s', ## used by MRTG checks only
'logfile=s', ## used by check_logfile only
'queryname=s', ## used by query_runtime only
'query=s', ## used by custom_query only
'valtype=s', ## used by custom_query only
'reverse', ## used by custom_query only
'repinfo=s', ## used by replicate_row only
'noidle', ## used by backends only
'datadir=s', ## used by checkpoint only
'schema=s@', ## used by slony_status only
'filter=s@', ## used by same_schema only
'suffix=s', ## used by same_schema only
'replace', ## used by same_schema only
'mrtg=s', ## used by MRTG checks only
'logfile=s', ## used by check_logfile only
'queryname=s', ## used by query_runtime only
'query=s', ## used by custom_query only
'valtype=s', ## used by custom_query only
'reverse', ## used by custom_query only
'repinfo=s', ## used by replicate_row only
'noidle', ## used by backends only
'datadir=s', ## used by checkpoint only
'schema=s@', ## used by slony_status only
'filter=s@', ## used by same_schema only
'suffix=s', ## used by same_schema only
'replace', ## used by same_schema only
'maxidlemasterdelay=i', ## used by check_replay_delay only
);

die $USAGE if ! keys %opt and ! @ARGV;
Expand Down Expand Up @@ -1018,6 +1022,9 @@ package check_postgres;
elsif ($name =~ /^dbservice(\d+)$/o) {
push @{ $opt{dbservice} } => $value;
}
elsif ($name =~ /^maxidlemasterdelay(\d+)$/o) {
push @{ $opt{maxidlemasterdelay} } => $value;
}
else {
push @badargs => $arg;
}
Expand Down Expand Up @@ -1148,6 +1155,7 @@ package check_postgres;
hitratio => [0, 'Report if the hit ratio of a database is too low.'],
hot_standby_delay => [1, 'Check the replication delay in hot standby setup'],
index_size => [0, 'Checks the size of indexes only.'],
replay_delay => [0, 'Check the log replay delay during recovery'],
table_size => [0, 'Checks the size of tables only.'],
relation_size => [0, 'Checks the size of tables and indexes.'],
last_analyze => [0, 'Check the maximum time in seconds since any one table has been analyzed.'],
Expand Down Expand Up @@ -1224,6 +1232,7 @@ package check_postgres;
--exclude=name(s) items to specifically exclude (e.g. tables), depends on the action
--includeuser=include objects owned by certain users
--excludeuser=exclude objects owned by certain users
--maxidlemasterdelay number of seconds slave can go without receiving a write from master; defaults to '$opt{defaultidlemasterdelay}'

Other options:
--assume-standby-mode assume that server in continious WAL recovery mode
Expand Down Expand Up @@ -1752,6 +1761,7 @@ sub finishup {
fsm_pages => 'VERSION: 8.2 MAX: 8.3',
fsm_relations => 'VERSION: 8.2 MAX: 8.3',
hot_standby_delay => 'VERSION: 9.0',
replay_delay => 'VERSION: 9.0',
listener => 'MAX: 8.4',
);
if ($opt{test}) {
Expand Down Expand Up @@ -1945,6 +1955,9 @@ sub finishup {
## Check the replication delay in hot standby setup
check_hot_standby_delay() if $action eq 'hot_standby_delay';

## Check the log replay delay during recovery
check_replay_delay() if $action eq 'replay_delay';

## Check the maximum transaction age of all connections
check_txn_time() if $action eq 'txn_time';

Expand Down Expand Up @@ -4743,6 +4756,61 @@ sub check_hitratio {
} ## end of check_hitratio


sub check_replay_delay {

    ## Check the log replay delay during recovery
    ## Supports: Nagios
    ## Critical and warning are the thresholds of delay in seconds.
    ## Example: --critical=5
    ## The --maxidlemasterdelay option (falling back to $opt{defaultidlemasterdelay})
    ## is the number of seconds a fully caught-up standby may go without replaying
    ## a transaction while still being reported as having zero delay.

    my ($warning, $critical) = validate_range({type => 'integer', leastone => 1});

    ## Use the --maxidlemasterdelay override if given, otherwise the default.
    ## Definedness is tested (not truth) so that an explicit value of 0 is honored.
    my $maxidlemasterdelay = $opt{maxidlemasterdelay};

    ## Numbered forms of the option are pushed onto an array reference by the
    ## argument parser: only a single value makes sense here, so use the first
    if (ref $maxidlemasterdelay eq 'ARRAY') {
        $maxidlemasterdelay = $maxidlemasterdelay->[0];
    }
    if (! defined $maxidlemasterdelay or ! length $maxidlemasterdelay) {
        $maxidlemasterdelay = $opt{defaultidlemasterdelay};
    }

    ## The value is interpolated directly into the SQL below, so insist on an integer
    if (! defined $maxidlemasterdelay or $maxidlemasterdelay !~ /^[-+]?\d+$/) {
        die qq{Invalid value for maxidlemasterdelay: must be an integer number of seconds\n};
    }

    ## Check if we are in recovery using pg_is_in_recovery()
    $SQL = q{SELECT pg_is_in_recovery() AS recovery;};

    my $info = run_command($SQL, { regex => qr([tf]) });
    for $db (@{$info->{db}}) {
        my $status = $db->{slurp}[0];
        if (defined $status->{recovery} and $status->{recovery} eq 'f') {
            add_critical('not in recovery');
            ## NOTE(review): this returns on the first target that is not in recovery,
            ## so any remaining targets are not examined - confirm this is intended
            return;
        }
    }

    ## We can't assume delay is none if last replayed equals last received, because in
    ## reality it could mean replication has gone out for lunch.
    ## This can lead to false alarms on an idle master, but is preferable to the
    ## opposite where replication has gone away and we assume everything is hunky-dory.
    ## This is also why this check is named replay_delay and not hot_standby_delay_slave.
    ##
    ## It only tells you the replication delay IF the master is active (i.e. receiving updates).
    ## If the master is not active, the time since the last replayed transaction is reported
    ## once it reaches maxidlemasterdelay, and will alert if it exceeds the given thresholds.
    $SQL = qq{SELECT CASE
        WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
        AND EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) < $maxidlemasterdelay
        THEN 0
        ELSE
        EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;};
    $info = run_command($SQL);

    for $db (@{$info->{db}}) {
        my $delay = $db->{slurp}[0]->{log_delay};

        ## pg_last_xact_replay_timestamp() is NULL until a transaction has been replayed,
        ## which makes log_delay NULL as well. Comparing that to the thresholds would
        ## quietly report OK, so flag it as unknown instead.
        if (! defined $delay or $delay !~ /^\s*-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?\s*$/) {
            add_unknown('delay=unknown (no transaction replay timestamp available)');
            next;
        }

        my $msg = qq{delay=${delay}s};

        if (length $critical and $delay > $critical) {
            add_critical($msg);
        }
        elsif (length $warning and $delay > $warning) {
            add_warning($msg);
        }
        else {
            add_ok($msg);
        }
    }

    return;

} ## end of check_replay_delay

sub check_hot_standby_delay {

## Check on the delay in PITR replication between master and slave
Expand Down Expand Up @@ -8874,6 +8942,12 @@ =head2 B<hot_standby_delay>

check_hot_standby_delay --dbhost=master,replica1 --warning='1048576 and 2 min' --critical='16777216 and 10 min'

=head2 B<replay_delay>

(C<symlink: check_replay_delay>) Returns the number of seconds that have passed since the last
transaction was replayed during recovery. In practice this tells you the replication delay of a
hot standby (measured locally on the standby), but only IF the master is active (i.e. the master
is receiving updates).

=head2 B<index_size>

=head2 B<table_size>
Expand Down