bucardo · tdevelioglu · Jan 29, 2015 · Jan 29, 2015 · Jan 30, 2015 · Nov 22, 2019
diff --git a/check_postgres.pl b/check_postgres.pl
@@ -39,6 +39,9 @@ package check_postgres;
 ## Which user to connect as if --dbuser is not given
 $opt{defaultuser} = 'postgres';
 
+## Number of seconds the slave may go without receiving a write from the master before alerting, if --maxidlemasterdelay is not given
+$opt{defaultidlemasterdelay} = 3600;
+
 ## Which port to connect to if --dbport is not given
 $opt{defaultport} = 5432;
 
@@ -975,19 +978,20 @@ package check_postgres;
     'tempdir=s',
     'get_method=s',
     'language=s',
-    'mrtg=s',      ## used by MRTG checks only
-    'logfile=s',   ## used by check_logfile only
-    'queryname=s', ## used by query_runtime only
-    'query=s',     ## used by custom_query only
-    'valtype=s',   ## used by custom_query only
-    'reverse',     ## used by custom_query only
-    'repinfo=s',   ## used by replicate_row only
-    'noidle',      ## used by backends only
-    'datadir=s',   ## used by checkpoint only
-    'schema=s@',   ## used by slony_status only
-    'filter=s@',   ## used by same_schema only
-    'suffix=s',    ## used by same_schema only
-    'replace',     ## used by same_schema only
+    'mrtg=s',               ## used by MRTG checks only
+    'logfile=s',            ## used by check_logfile only
+    'queryname=s',          ## used by query_runtime only
+    'query=s',              ## used by custom_query only
+    'valtype=s',            ## used by custom_query only
+    'reverse',              ## used by custom_query only
+    'repinfo=s',            ## used by replicate_row only
+    'noidle',               ## used by backends only
+    'datadir=s',            ## used by checkpoint only
+    'schema=s@',            ## used by slony_status only
+    'filter=s@',            ## used by same_schema only
+    'suffix=s',             ## used by same_schema only
+    'replace',              ## used by same_schema only
+    'maxidlemasterdelay=i', ## used by check_replay_delay only
 );
 
 die $USAGE if ! keys %opt and ! @ARGV;
@@ -1018,6 +1022,9 @@ package check_postgres;
         elsif ($name =~ /^dbservice(\d+)$/o) {
             push @{ $opt{dbservice} } => $value;
         }
+        elsif ($name =~ /^maxidlemasterdelay(\d+)$/o) {
+            push @{ $opt{maxidlemasterdelay} } => $value;
+        }
         else {
             push @badargs => $arg;
         }
@@ -1148,6 +1155,7 @@ package check_postgres;
  hitratio            => [0, 'Report if the hit ratio of a database is too low.'],
  hot_standby_delay   => [1, 'Check the replication delay in hot standby setup'],
  index_size          => [0, 'Checks the size of indexes only.'],
+ replay_delay        => [0, 'Check the log replay delay during recovery'],
  table_size          => [0, 'Checks the size of tables only.'],
  relation_size       => [0, 'Checks the size of tables and indexes.'],
  last_analyze        => [0, 'Check the maximum time in seconds since any one table has been analyzed.'],
@@ -1224,6 +1232,7 @@ package check_postgres;
   --exclude=name(s) items to specifically exclude (e.g. tables), depends on the action
   --includeuser=include objects owned by certain users
   --excludeuser=exclude objects owned by certain users
+  --maxidlemasterdelay number of seconds slave can go without receiving a write from master; defaults to '$opt{defaultidlemasterdelay}'
 
 Other options:
   --assume-standby-mode assume that server in continious WAL recovery mode
@@ -1752,6 +1761,7 @@ sub finishup {
                   fsm_pages         => 'VERSION: 8.2 MAX: 8.3',
                   fsm_relations     => 'VERSION: 8.2 MAX: 8.3',
                   hot_standby_delay => 'VERSION: 9.0',
+                  replay_delay      => 'VERSION: 9.0',
                   listener          => 'MAX: 8.4',
 );
 if ($opt{test}) {
@@ -1945,6 +1955,9 @@ sub finishup {
 ## Check the replication delay in hot standby setup
 check_hot_standby_delay() if $action eq 'hot_standby_delay';
 
+## Check the log replay delay during recovery
+check_replay_delay() if $action eq 'replay_delay';
+
 ## Check the maximum transaction age of all connections
 check_txn_time() if $action eq 'txn_time';
 
@@ -4743,6 +4756,61 @@ sub check_hitratio {
 } ## end of check_hitratio
 
 
+sub check_replay_delay {
+    ## Check the log replay delay during recovery, as seen from the standby itself
+    ## Supports: Nagios
+    ## Critical and warning are the thresholds of delay in seconds.
+    ## Example: --critical=5
+
+    my ($warning, $critical) = validate_range({type => 'integer', leastone => 1});
+
+    ## Idle limit: the override if given (an explicit 0 is honored; a numbered
+    ## --maxidlemasterdelayN argument arrives as an arrayref), else the default.
+    my $maxidlemasterdelay = $opt{maxidlemasterdelay};
+    $maxidlemasterdelay = $maxidlemasterdelay->[0] if ref $maxidlemasterdelay eq 'ARRAY';
+    ## Only plain digits may be interpolated into the SQL below
+    $maxidlemasterdelay = $opt{defaultidlemasterdelay}
+        if ! defined $maxidlemasterdelay or $maxidlemasterdelay !~ /\A\d+\z/;
+
+    ## A server that is not in recovery has no replay delay to measure
+    $SQL = q{SELECT pg_is_in_recovery() AS recovery;};
+    my $info = run_command($SQL, { regex => qr([tf]) });
+    for $db (@{$info->{db}}) {
+        next if $db->{slurp}[0]{recovery} ne 'f';
+        add_critical("not in recovery");
+        return;
+    }
+
+    ## Equal receive and replay locations do not prove a delay of zero: replication
+    ## may have stopped altogether. A caught-up standby counts as delay 0 only while
+    ## its last replayed transaction is newer than the idle limit; otherwise the time
+    ## since that transaction is reported, even if the master was merely idle.
+    $SQL = qq{SELECT CASE
+      WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
+        AND EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) < $maxidlemasterdelay
+      THEN 0
+      ELSE
+        EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;};
+    $info = run_command($SQL);
+
+    for $db (@{$info->{db}}) {
+        my $delay = $db->{slurp}[0]{log_delay};
+        ## A NULL replay timestamp (nothing replayed yet) yields no usable number
+        if (! defined $delay or $delay !~ /\d/) {
+            add_unknown("delay cannot be determined");
+        }
+        elsif (length $critical and $delay > $critical) {
+            add_critical("delay=${delay}s");
+        }
+        elsif (length $warning and $delay > $warning) {
+            add_warning("delay=${delay}s");
+        }
+        else {
+            add_ok("delay=${delay}s");
+        }
+    }
+} ## end of check_replay_delay
+
 sub check_hot_standby_delay {
 
     ## Check on the delay in PITR replication between master and slave
@@ -8874,6 +8942,12 @@ =head2 B<hot_standby_delay>
 
   check_hot_standby_delay --dbhost=master,replica1 --warning='1048576 and 2 min' --critical='16777216 and 10 min'
 
+=head2 B<replay_delay>
+
+(C<symlink: check_replay_delay>) Reports the number of seconds that have passed since the last
+transaction was replayed during recovery, as measured locally on the hot standby. This is the actual
+replication delay only IF the master is active (i.e. receiving updates). Thresholds are in seconds.
+
 =head2 B<index_size>
 
 =head2 B<table_size>