From 1165eaf89ab7e50471d11da839d2de4129779cd0 Mon Sep 17 00:00:00 2001 From: Marbin Tan Date: Wed, 28 Jun 2023 12:41:57 -0700 Subject: [PATCH] Add gp_stat_progress_dtx_recovery for observability dtx recovery may take time sometimes and will leave you in the dark with how far the recovery has progressed. `gp_stat_progress_dtx_recovery` is a view that shows progress of the current dtx recovery. There are 5 main phases (excluding initializing): 1. recovering commited distributed transactions 2. gathering in-doubt transactions 3. aborting in-doubt transactions 4. gathering in-doubt orphaned transactions 5. managing in-doubt orphaned transactions `gp_stat_progress_dtx_recovery` will be shown with column: phase recover_commited_dtx_total recover_commited_dtx_completed in_doubt_tx_total in_doubt_tx_in_progress in_doubt_tx_aborted The wording of "managing" is used instead of "aborting" for in-doubt orphaned transactions to make it slightly ambigious. There is a chance an in-doubt transaction is already an on going transaction and there will be no need to dispatch a dtx abort command. fix profile --- src/backend/catalog/system_views.sql | 16 ++++ src/backend/cdb/cdbdtxrecovery.c | 68 ++++++++++++++++- src/backend/utils/adt/pgstatfuncs.c | 2 + src/backend/utils/init/postinit.c | 2 +- src/backend/utils/misc/faultinjector.c | 4 +- src/include/commands/progress.h | 15 ++++ src/include/utils/backend_progress.h | 3 +- .../expected/crash_recovery_dtm.out | 75 ++++++++++++++++++- .../segwalrep/dtx_recovery_wait_lsn.out | 39 +++++++++- .../isolation2/sql/crash_recovery_dtm.sql | 25 ++++++- .../sql/segwalrep/dtx_recovery_wait_lsn.sql | 16 +++- src/test/regress/expected/profile.out | 2 +- 12 files changed, 251 insertions(+), 16 deletions(-) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7ec1293af38..8cfaa58ac51 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1741,6 +1741,22 @@ CREATE VIEW pg_stat_progress_copy AS FROM pg_stat_get_progress_info('COPY') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +CREATE VIEW gp_stat_progress_dtx_recovery AS + SELECT + CASE S.param1 WHEN 0 THEN 'initializing' + WHEN 1 THEN 'recovering commited distributed transactions' + WHEN 2 THEN 'gathering in-doubt transactions' + WHEN 3 THEN 'aborting in-doubt transactions' + WHEN 4 THEN 'gathering in-doubt orphaned transactions' + WHEN 5 THEN 'managing in-doubt orphaned transactions' + END AS phase, + S.param2 AS recover_commited_dtx_total, -- total commited transactions found to recover + S.param3 AS recover_commited_dtx_completed, -- recover completed, this is always 0 after startup. + S.param4 AS in_doubt_tx_total, -- total in doubt tx found, used in startup and non-startup phase + S.param5 AS in_doubt_tx_in_progress, -- in-progress in-doubt tx, this is always 0 for startup + S.param6 AS in_doubt_tx_aborted -- aborted in-doubt tx, this can be >0 for both + FROM pg_stat_get_progress_info('DTX RECOVERY') AS S; + CREATE VIEW pg_user_mappings AS SELECT U.oid AS umid, diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c index 7a6f6c5d015..186b01ff214 100644 --- a/src/backend/cdb/cdbdtxrecovery.c +++ b/src/backend/cdb/cdbdtxrecovery.c @@ -19,6 +19,7 @@ #include "utils/ps_status.h" #include "postmaster/bgworker.h" #include "pgstat.h" +#include "commands/progress.h" #include "storage/ipc.h" #include "storage/latch.h" #include "storage/lwlock.h" @@ -53,6 +54,10 @@ volatile int *shmNumCommittedGxacts; static volatile sig_atomic_t got_SIGHUP = false; +static int64 in_doubt_tx_in_progress = 0; +static int64 in_doubt_tx_aborted = 0; +static int64 in_doubt_tx_total = 0; + typedef struct InDoubtDtx { char gid[TMGIDSIZE]; @@ -190,6 +195,11 @@ recoverInDoubtTransactions(void) "Going to retry commit notification for distributed transactions (count = %d)", *shmNumCommittedGxacts); + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_PHASE, + PROGRESS_DTX_RECOVERY_PHASE_STARTUP_RECOVER_COMMITTED_DTX); + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_COMITTED_DTX_TOTAL, + *shmNumCommittedGxacts); + for (i = 0; i < *shmNumCommittedGxacts; i++) { DistributedTransactionId gxid = shmCommittedGxidArray[i]; @@ -204,16 +214,27 @@ recoverInDoubtTransactions(void) doNotifyCommittedInDoubt(gid); RecordDistributedForgetCommitted(gxid); + + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_COMITTED_DTX_COMPLETED, + i+1); } + SIMPLE_FAULT_INJECTOR("post_progress_recovery_comitted"); + *shmNumCommittedGxacts = 0; + + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_PHASE, + PROGRESS_DTX_RECOVERY_PHASE_STARTUP_GATHER_IN_DOUBT_TX); + /* * Any in-doubt transctions found will be for aborted * transactions. Gather in-doubt transactions and issue aborts. */ htab = gatherRMInDoubtTransactions(0, true); + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_PHASE, + PROGRESS_DTX_RECOVERY_PHASE_STARTUP_ABORT_IN_DOUBT_TX); /* * go through and resolve any remaining in-doubt transactions that the * RM's have AFTER recoverDTMInDoubtTransactions. ALL of these in doubt @@ -351,6 +372,9 @@ gatherRMInDoubtTransactions(int prepared_seconds, bool raiseError) elog(DEBUG3, "Found in-doubt transaction with GID: %s on remote RM", gid); strncpy(lastDtx->gid, gid, TMGIDSIZE); + + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_TOTAL, + ++in_doubt_tx_total); } } } @@ -388,6 +412,9 @@ abortRMInDoubtTransactions(HTAB *htab) elog(DTM_DEBUG3, "Aborting in-doubt transaction with gid = %s", entry->gid); doAbortInDoubt(entry->gid); + + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_ABORTED, + ++in_doubt_tx_aborted); } } @@ -414,10 +441,19 @@ abortOrphanedTransactions(HTAB *htab) dtxDeformGid(entry->gid, &gxid); - if (!IsDtxInProgress(gxid)) + if (IsDtxInProgress(gxid)) + { + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_IN_PROGRESS, + ++in_doubt_tx_in_progress); + SIMPLE_FAULT_INJECTOR("post_in_doubt_tx_in_progress"); + } + else { elog(LOG, "Aborting orphaned transactions with gid = %s", entry->gid); doAbortInDoubt(entry->gid); + + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_ABORTED, + ++in_doubt_tx_aborted); } } } @@ -536,6 +572,14 @@ DtxRecoveryStartRule(Datum main_arg) return (Gp_role == GP_ROLE_DISPATCH); } +static void +ResetInDoubtStatProgress() +{ + in_doubt_tx_total = 0; + in_doubt_tx_aborted = 0; + in_doubt_tx_in_progress = 0; +} + static void AbortOrphanedPreparedTransactions() { @@ -546,9 +590,15 @@ AbortOrphanedPreparedTransactions() return; #endif + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_PHASE, + PROGRESS_DTX_RECOVERY_PHASE_GATHER_IN_DOUBT_TX); + StartTransactionCommand(); htab = gatherRMInDoubtTransactions(gp_dtx_recovery_prepared_period, false); + pgstat_progress_update_param(PROGRESS_DTX_RECOVERY_PHASE, + PROGRESS_DTX_RECOVERY_PHASE_ABORT_IN_DOUBT_TX); + /* in case an error happens somehow. */ if (htab != NULL) { @@ -629,6 +679,9 @@ DtxRecoveryMain(Datum main_arg) /* Connect to postgres */ BackgroundWorkerInitializeConnection(DB_FOR_COMMON_ACCESS, NULL, 0); + /* initialize progress */ + pgstat_progress_start_command(PROGRESS_COMMAND_DTX_RECOVERY, InvalidOid); + /* * Do dtx recovery process. It is possible that *shmDtmStarted is true * here if we terminate after this code block, e.g. due to error and then @@ -646,6 +699,8 @@ DtxRecoveryMain(Datum main_arg) set_ps_display(""); } + pgstat_progress_end_command(); + /* Fetch the gxid batch in advance. */ bumpGxid(); @@ -690,7 +745,18 @@ DtxRecoveryMain(Datum main_arg) * cases so that it could respond promptly for gxid bumping given * the abort operation might be time-consuming. */ + + /* + * The total amount of in doubt transactions are calculated per iteration + * and are destroyed after use. Keeping the old statistics may be confusing, + * so clear it up. + */ + ResetInDoubtStatProgress(); + pgstat_progress_start_command(PROGRESS_COMMAND_DTX_RECOVERY, InvalidOid); + AbortOrphanedPreparedTransactions(); + + pgstat_progress_end_command(); } rc = WaitLatch(&MyProc->procLatch, diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 242fdbc3cb7..89621072f80 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -504,6 +504,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_BASEBACKUP; else if (pg_strcasecmp(cmd, "COPY") == 0) cmdtype = PROGRESS_COMMAND_COPY; + else if (pg_strcasecmp(cmd, "DTX RECOVERY") == 0) + cmdtype = PROGRESS_COMMAND_DTX_RECOVERY; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 6163eff9454..a335f3fdef7 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -1251,7 +1251,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * report this backend in the PgBackendStatus array, meanwhile, we do not * want users to see auxiliary background worker like fts in pg_stat_* views. */ - if (!bootstrap && !amAuxiliaryBgWorker()) + if (!bootstrap && (!amAuxiliaryBgWorker() || IsDtxRecoveryProcess())) pgstat_bestart(); /* diff --git a/src/backend/utils/misc/faultinjector.c b/src/backend/utils/misc/faultinjector.c index 3aa202dc41d..df456b3bb08 100644 --- a/src/backend/utils/misc/faultinjector.c +++ b/src/backend/utils/misc/faultinjector.c @@ -277,7 +277,9 @@ checkBgProcessSkipFault(const char* faultName) { /* dtx recovery process */ if (0 != strcmp("before_orphaned_check", faultName) && - 0 != strcmp("after_orphaned_check", faultName)) + 0 != strcmp("after_orphaned_check", faultName) && + 0 != strcmp("post_in_doubt_tx_in_progress", faultName) && + 0 != strcmp("post_progress_recovery_comitted", faultName)) { elog(LOG, "skipped fault '%s' in dtx recovery process", faultName); return true; diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 990f8b8e6f4..f18e63bd7af 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -155,4 +155,19 @@ #define PROGRESS_COPY_TYPE_PIPE 3 #define PROGRESS_COPY_TYPE_CALLBACK 4 +/* Progress parameters for DTX recovery */ +#define PROGRESS_DTX_RECOVERY_PHASE 0 +#define PROGRESS_DTX_RECOVERY_COMITTED_DTX_TOTAL 1 +#define PROGRESS_DTX_RECOVERY_COMITTED_DTX_COMPLETED 2 +#define PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_TOTAL 3 +#define PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_IN_PROGRESS 4 +#define PROGRESS_DTX_RECOVERY_IN_DOUBT_TX_ABORTED 5 + +/* Phases of DTX recovery */ +#define PROGRESS_DTX_RECOVERY_PHASE_STARTUP_RECOVER_COMMITTED_DTX 1 +#define PROGRESS_DTX_RECOVERY_PHASE_STARTUP_GATHER_IN_DOUBT_TX 2 +#define PROGRESS_DTX_RECOVERY_PHASE_STARTUP_ABORT_IN_DOUBT_TX 3 +#define PROGRESS_DTX_RECOVERY_PHASE_GATHER_IN_DOUBT_TX 4 +#define PROGRESS_DTX_RECOVERY_PHASE_ABORT_IN_DOUBT_TX 5 + #endif diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index 53bddf665b1..8458625c89a 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -27,7 +27,8 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_CLUSTER, PROGRESS_COMMAND_CREATE_INDEX, PROGRESS_COMMAND_BASEBACKUP, - PROGRESS_COMMAND_COPY + PROGRESS_COMMAND_COPY, + PROGRESS_COMMAND_DTX_RECOVERY } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/isolation2/expected/crash_recovery_dtm.out b/src/test/isolation2/expected/crash_recovery_dtm.out index 7ada74a348f..7fb1a3abb85 100644 --- a/src/test/isolation2/expected/crash_recovery_dtm.out +++ b/src/test/isolation2/expected/crash_recovery_dtm.out @@ -216,6 +216,11 @@ ALTER ----------------- Success: (1 row) +11: select gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid) FROM gp_segment_configuration WHERE content=-1 AND role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) -- statement to trigger fault after writing prepare record 12&: DELETE FROM QE_panic_test_table; 11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role='p' and content=0; @@ -229,9 +234,26 @@ ALTER OK (1 row) 12<: <... completed> -ERROR: Error on receive from seg0 127.0.0.1:7002 pid=25361: server closed the connection unexpectedly +ERROR: Error on receive from seg0 127.0.0.1:7002 pid=5600: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. + +11: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +11: select * from gp_stat_progress_dtx_recovery; + phase | recover_commited_dtx_total | recover_commited_dtx_completed | in_doubt_tx_total | in_doubt_tx_in_progress | in_doubt_tx_aborted +--------------+----------------------------+--------------------------------+-------------------+-------------------------+--------------------- + initializing | 0 | 0 | 0 | 0 | 0 +(1 row) +11: SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + 13: SELECT count(*) from QE_panic_test_table; count ------- @@ -320,6 +342,11 @@ server closed the connection unexpectedly -------------------------- Success: (1 row) +19: SELECT gp_inject_fault_infinite('after_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) 19: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; gp_wait_until_triggered_fault ------------------------------- @@ -363,8 +390,24 @@ server closed the connection unexpectedly -------------------------- Success: (1 row) +-- verify orphaned prepared transacion is aborted +19: SELECT gp_wait_until_triggered_fault('after_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +19: select * from gp_stat_progress_dtx_recovery; + phase | recover_commited_dtx_total | recover_commited_dtx_completed | in_doubt_tx_total | in_doubt_tx_in_progress | in_doubt_tx_aborted +-----------------------------------------+----------------------------+--------------------------------+-------------------+-------------------------+--------------------- + managing in-doubt orphaned transactions | 0 | 0 | 1 | 0 | 1 +(1 row) +19: SELECT gp_inject_fault_infinite('after_orphaned_check', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) 19: DROP TABLE master_reset; -DROP +DROP TABLE 19: ALTER SYSTEM RESET gp_dtx_recovery_interval; ALTER 19: ALTER SYSTEM RESET gp_dtx_recovery_prepared_period; @@ -412,6 +455,11 @@ CREATE -------------------------- Success: (1 row) +20: SELECT gp_inject_fault_infinite('post_in_doubt_tx_in_progress', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) -- run two phase query. 21: INSERT INTO test_retry_abort SELECT generate_series(1,10); @@ -428,6 +476,22 @@ ERROR: fault triggered, fault name:'dtm_broadcast_prepare' fault type:'error' ------- 1 (1 row) +-- verify in-doubt transaction is in progress +20: SELECT gp_wait_until_triggered_fault('post_in_doubt_tx_in_progress', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +20: select * from gp_stat_progress_dtx_recovery; + phase | recover_commited_dtx_total | recover_commited_dtx_completed | in_doubt_tx_total | in_doubt_tx_in_progress | in_doubt_tx_aborted +-----------------------------------------+----------------------------+--------------------------------+-------------------+-------------------------+--------------------- + managing in-doubt orphaned transactions | 0 | 0 | 1 | 1 | 0 +(1 row) +19: SELECT gp_inject_fault_infinite('post_in_doubt_tx_in_progress', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) -- dtx recovery ready to handle the orphaned prepared transaction. 20: SELECT gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; @@ -440,7 +504,7 @@ ERROR: fault triggered, fault name:'dtm_broadcast_prepare' fault type:'error' ------------------------------- Success: (1 row) -20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = -1; +20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; gp_inject_fault_infinite -------------------------- Success: @@ -468,6 +532,11 @@ ERROR: fault triggered, fault name:'dtm_broadcast_prepare' fault type:'error' transaction | gid | prepared | owner | database -------------+-----+----------+-------+---------- (0 rows) +20: select * from gp_stat_progress_dtx_recovery; + phase | recover_commited_dtx_total | recover_commited_dtx_completed | in_doubt_tx_total | in_doubt_tx_in_progress | in_doubt_tx_aborted +-----------------------------------------+----------------------------+--------------------------------+-------------------+-------------------------+--------------------- + managing in-doubt orphaned transactions | 0 | 0 | 1 | 0 | 1 +(1 row) -- cleanup 20: ALTER SYSTEM RESET gp_dtx_recovery_interval; diff --git a/src/test/isolation2/expected/segwalrep/dtx_recovery_wait_lsn.out b/src/test/isolation2/expected/segwalrep/dtx_recovery_wait_lsn.out index a410a91b754..ce415fc5f96 100644 --- a/src/test/isolation2/expected/segwalrep/dtx_recovery_wait_lsn.out +++ b/src/test/isolation2/expected/segwalrep/dtx_recovery_wait_lsn.out @@ -7,6 +7,8 @@ 1: create table t_wait_lsn(a int); CREATE +5: create table t_wait_lsn2(a int); +CREATE -- suspend segment 0 before performing 'COMMIT PREPARED' 2: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; @@ -15,7 +17,8 @@ CREATE Success: (1 row) 1&: insert into t_wait_lsn values(2),(1); -2: select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 1, dbid) from gp_segment_configuration where content=0 and role='p'; +5&: insert into t_wait_lsn2 values(2),(1); +2: select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 2, dbid) from gp_segment_configuration where content=0 and role='p'; gp_wait_until_triggered_fault ------------------------------- Success: @@ -37,7 +40,7 @@ CREATE 0U: select count(*) from pg_prepared_xacts; count ------- - 1 + 2 (1 row) -- stop mirror @@ -69,7 +72,30 @@ server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. --- wait for master finish crash recovery +-- potential flakiness: there is a chance where the coordinator +-- recovers fast enough (from the panic above) that we end up fault injecting too late. +-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'suspend', dbid) FROM gp_segment_configuration WHERE content=-1 AND role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +-1U: select gp_wait_until_triggered_fault('post_progress_recovery_comitted', 1, dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +-1U: select * from gp_stat_progress_dtx_recovery; + phase | recover_commited_dtx_total | recover_commited_dtx_completed | in_doubt_tx_total | in_doubt_tx_in_progress | in_doubt_tx_aborted +----------------------------------------------+----------------------------+--------------------------------+-------------------+-------------------------+--------------------- + recovering commited distributed transactions | 2 | 2 | 0 | 0 | 0 +(1 row) +-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- wait for coordinator finish crash recovery -1U: select wait_until_standby_in_state('streaming'); wait_until_standby_in_state ----------------------------- @@ -85,8 +111,11 @@ server closed the connection unexpectedly ------- 2 (1 row) - 1<: <... completed> +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +5<: <... completed> server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. @@ -121,6 +150,8 @@ server closed the connection unexpectedly (1 row) 4: drop table t_wait_lsn; DROP +4: drop table t_wait_lsn2; +DROP 4: select gp_inject_fault('walrecv_skip_flush', 'reset', dbid) from gp_segment_configuration where content=0; gp_inject_fault diff --git a/src/test/isolation2/sql/crash_recovery_dtm.sql b/src/test/isolation2/sql/crash_recovery_dtm.sql index 3c7ce2f043c..63e74bd62a5 100644 --- a/src/test/isolation2/sql/crash_recovery_dtm.sql +++ b/src/test/isolation2/sql/crash_recovery_dtm.sql @@ -113,12 +113,18 @@ select pg_reload_conf(); 11: SELECT gp_request_fts_probe_scan(); 11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid) from gp_segment_configuration where role='p' and content=0; +11: select gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid) FROM gp_segment_configuration WHERE content=-1 AND role='p'; -- statement to trigger fault after writing prepare record 12&: DELETE FROM QE_panic_test_table; 11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role='p' and content=0; 11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0; 12<: + +11: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; +11: select * from gp_stat_progress_dtx_recovery; +11: SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + 13: SELECT count(*) from QE_panic_test_table; 13: SELECT * FROM gp_dist_random('pg_prepared_xacts'); 13: SELECT gp_inject_fault('fts_probe', 'reset', dbid) @@ -155,6 +161,8 @@ select pg_reload_conf(); -- master suspends before running periodical checking of orphaned prepared transactions. 19: SELECT gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; +19: SELECT gp_inject_fault_infinite('after_orphaned_check', 'suspend', dbid) + from gp_segment_configuration where role = 'p' and content = -1; 19: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; 19: SELECT gp_inject_fault_infinite('after_orphaned_check', 'suspend', dbid) @@ -176,6 +184,12 @@ select pg_reload_conf(); -- with required lock of the drop operation. 19: SELECT gp_inject_fault_infinite('before_orphaned_check', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1; +-- verify orphaned prepared transacion is aborted +19: SELECT gp_wait_until_triggered_fault('after_orphaned_check', 1, dbid) + from gp_segment_configuration where role = 'p' and content = -1; +19: select * from gp_stat_progress_dtx_recovery; +19: SELECT gp_inject_fault_infinite('after_orphaned_check', 'reset', dbid) + from gp_segment_configuration where role = 'p' and content = -1; 19: DROP TABLE master_reset; 19: ALTER SYSTEM RESET gp_dtx_recovery_interval; 19: ALTER SYSTEM RESET gp_dtx_recovery_prepared_period; @@ -201,6 +215,8 @@ select pg_reload_conf(); from gp_segment_configuration where role = 'p' and content = -1; 20: SELECT gp_inject_fault_infinite('finish_prepared_start_of_function', 'error', dbid) from gp_segment_configuration where role = 'p' and content = 0; +20: SELECT gp_inject_fault_infinite('post_in_doubt_tx_in_progress', 'suspend', dbid) + from gp_segment_configuration where role = 'p' and content = -1; -- run two phase query. 21: INSERT INTO test_retry_abort SELECT generate_series(1,10); @@ -209,13 +225,19 @@ select pg_reload_conf(); -- transaction on seg0. 20: SELECT * from test_retry_abort; 0U: SELECT count(*) from pg_prepared_xacts; +-- verify in-doubt transaction is in progress +20: SELECT gp_wait_until_triggered_fault('post_in_doubt_tx_in_progress', 1, dbid) + from gp_segment_configuration where role = 'p' and content = -1; +20: select * from gp_stat_progress_dtx_recovery; +19: SELECT gp_inject_fault_infinite('post_in_doubt_tx_in_progress', 'reset', dbid) + from gp_segment_configuration where role = 'p' and content = -1; -- dtx recovery ready to handle the orphaned prepared transaction. 20: SELECT gp_inject_fault_infinite('before_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; 20: SELECT gp_wait_until_triggered_fault('before_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; -20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'skip', dbid) +20: SELECT gp_inject_fault_infinite('after_orphaned_check', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1; -- kick off abort prepared on seg0 and then dtx recovery will abort that one. @@ -228,6 +250,7 @@ select pg_reload_conf(); 20: SELECT gp_wait_until_triggered_fault('after_orphaned_check', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1; 0U: SELECT * from pg_prepared_xacts; +20: select * from gp_stat_progress_dtx_recovery; -- cleanup 20: ALTER SYSTEM RESET gp_dtx_recovery_interval; diff --git a/src/test/isolation2/sql/segwalrep/dtx_recovery_wait_lsn.sql b/src/test/isolation2/sql/segwalrep/dtx_recovery_wait_lsn.sql index e9e61f2d0d6..8cd7666e236 100644 --- a/src/test/isolation2/sql/segwalrep/dtx_recovery_wait_lsn.sql +++ b/src/test/isolation2/sql/segwalrep/dtx_recovery_wait_lsn.sql @@ -6,11 +6,13 @@ -- while the 'dtx recovery' process is hanging recovering distributed transactions. 1: create table t_wait_lsn(a int); +5: create table t_wait_lsn2(a int); -- suspend segment 0 before performing 'COMMIT PREPARED' 2: select gp_inject_fault_infinite('finish_prepared_start_of_function', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; 1&: insert into t_wait_lsn values(2),(1); -2: select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 1, dbid) from gp_segment_configuration where content=0 and role='p'; +5&: insert into t_wait_lsn2 values(2),(1); +2: select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 2, dbid) from gp_segment_configuration where content=0 and role='p'; -- let walreceiver on mirror 0 skip WAL flush 2: select gp_inject_fault_infinite('walrecv_skip_flush', 'skip', dbid) from gp_segment_configuration where content=0 and role='m'; @@ -32,7 +34,14 @@ -- end_matchsubs 3: select 1; --- wait for master finish crash recovery +-- potential flakiness: there is a chance where the coordinator +-- recovers fast enough (from the panic above) that we end up fault injecting too late. +-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'suspend', dbid) FROM gp_segment_configuration WHERE content=-1 AND role='p'; +-1U: select gp_wait_until_triggered_fault('post_progress_recovery_comitted', 1, dbid) from gp_segment_configuration where content=-1 and role='p'; +-1U: select * from gp_stat_progress_dtx_recovery; +-1U: select gp_inject_fault_infinite('post_progress_recovery_comitted', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; + +-- wait for coordinator finish crash recovery -1U: select wait_until_standby_in_state('streaming'); -- wait for FTS to 'sync off' the mirror, meanwhile, dtx recovery process will @@ -40,8 +49,8 @@ -- the query should succeed finally since dtx recovery process is able to quit. -- this's what we want to test. 4: select count(*) from t_wait_lsn; - 1<: +5<: !\retcode gpfts -R 1 -A -D; !\retcode gprecoverseg -a; @@ -51,5 +60,6 @@ 4: select pg_sleep(10); 4: select count(*) from t_wait_lsn; 4: drop table t_wait_lsn; +4: drop table t_wait_lsn2; 4: select gp_inject_fault('walrecv_skip_flush', 'reset', dbid) from gp_segment_configuration where content=0; diff --git a/src/test/regress/expected/profile.out b/src/test/regress/expected/profile.out index b7fcbd66024..9c2c203602b 100644 --- a/src/test/regress/expected/profile.out +++ b/src/test/regress/expected/profile.out @@ -5,7 +5,7 @@ SELECT COUNT(*) FROM pg_stat_activity; count ------- - 8 + 9 (1 row) -- Display pg_authid, pg_roles, pg_profile and pg_password_history catalog