Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry Pick Fix orphaned temp namespace catalog entry left on coordinator #882

Merged
merged 17 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion gpMgmt/bin/gpexpand
Original file line number Diff line number Diff line change
Expand Up @@ -1240,6 +1240,20 @@ class gpexpand:

tablespace_inputfile = self.options.filename + ".ts"

"""
Check if the tablespace input file exists or not
In cases where the user manually creates an input file, the file
will not be present. In such cases create the file and exit giving the
user a chance to review it and re-run gpexpand.
"""
if not os.path.exists(tablespace_inputfile):
self.generate_tablespace_inputfile(tablespace_inputfile)
self.logger.warning("Could not locate tablespace input configuration file '{0}'. A new tablespace input configuration file is written " \
"to '{0}'. Please review the file and re-run with: gpexpand -i {1}".format(tablespace_inputfile, self.options.filename))

logger.info("Exiting...")
sys.exit(1)

new_tblspc_info = {}

with open(tablespace_inputfile) as f:
Expand Down Expand Up @@ -2481,10 +2495,10 @@ def main(options, args, parser):
_gp_expand.validate_heap_checksums()
newSegList = _gp_expand.read_input_files()
_gp_expand.addNewSegments(newSegList)
newTableSpaceInfo = _gp_expand.read_tablespace_file()
_gp_expand.sync_packages()
_gp_expand.start_prepare()
_gp_expand.lock_catalog()
newTableSpaceInfo = _gp_expand.read_tablespace_file()
_gp_expand.add_segments(newTableSpaceInfo)
_gp_expand.update_original_segments()
_gp_expand.cleanup_new_segments()
Expand Down
21 changes: 18 additions & 3 deletions gpMgmt/bin/gppylib/commands/gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1215,16 +1215,31 @@ def get_gphome():
raise GpError('Environment Variable GPHOME not set')
return gphome

'''
gprecoverseg, gpstart, gpstate, gpstop, gpaddmirror have -d option to give the coordinator data directory.
but its value was not used throughout the utilities. to fix this the best possible way is
to set and retrieve that set coordinator dir when we call get_coordinatordatadir().
'''
option_coordinator_datadir = None
def set_coordinatordatadir(coordinator_datadir=None):
global option_coordinator_datadir
option_coordinator_datadir = coordinator_datadir

######
# Support both COORDINATOR_DATA_DIRECTORY and MASTER_DATA_DIRECTORY for backwards compatibility.
# If both are set, the former is used and the latter is ignored.
# if -d <coordinator_datadir> is provided with utility, it will be prioritiese over other options.
def get_coordinatordatadir():
coordinator_datadir = os.environ.get('COORDINATOR_DATA_DIRECTORY')
if not coordinator_datadir:
coordinator_datadir = os.environ.get('MASTER_DATA_DIRECTORY')
if option_coordinator_datadir is not None:
coordinator_datadir = option_coordinator_datadir
else:
coordinator_datadir = os.environ.get('COORDINATOR_DATA_DIRECTORY')
if not coordinator_datadir:
coordinator_datadir = os.environ.get('MASTER_DATA_DIRECTORY')

if not coordinator_datadir:
raise GpError("Environment Variable COORDINATOR_DATA_DIRECTORY not set!")

return coordinator_datadir

######
Expand Down
35 changes: 20 additions & 15 deletions gpMgmt/bin/gppylib/mainUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,18 @@ def simple_main(createOptionParserFn, createCommandFn, mainOptions=None):


def simple_main_internal(createOptionParserFn, createCommandFn, mainOptions):

"""
if -d <coordinator_data_dir> option is provided in that case doing parsing after creating
lock file would not be a good idea therefore handling -d option before lock.
"""
parser = createOptionParserFn()
(parserOptions, parserArgs) = parser.parse_args()

if parserOptions.ensure_value("coordinatorDataDirectory", None) is not None:
parserOptions.coordinator_data_directory = os.path.abspath(parserOptions.coordinatorDataDirectory)
gp.set_coordinatordatadir(parserOptions.coordinator_data_directory)

"""
If caller specifies 'pidlockpath' in mainOptions then we manage the
specified pid file within the COORDINATOR_DATA_DIRECTORY before proceeding
Expand All @@ -285,13 +297,13 @@ def simple_main_internal(createOptionParserFn, createCommandFn, mainOptions):

# at this point we have whatever lock we require
try:
simple_main_locked(createOptionParserFn, createCommandFn, mainOptions)
simple_main_locked(parserOptions, parserArgs, createCommandFn, mainOptions)
finally:
if sml is not None:
sml.release()


def simple_main_locked(createOptionParserFn, createCommandFn, mainOptions):
def simple_main_locked(parserOptions, parserArgs, createCommandFn, mainOptions):
"""
Not to be called externally -- use simple_main instead
"""
Expand All @@ -307,7 +319,6 @@ def simple_main_locked(createOptionParserFn, createCommandFn, mainOptions):
parser = None

forceQuiet = mainOptions is not None and mainOptions.get("forceQuietOutput")
options = None

if mainOptions is not None and mainOptions.get("programNameOverride"):
global gProgramName
Expand All @@ -323,30 +334,24 @@ def simple_main_locked(createOptionParserFn, createCommandFn, mainOptions):
hostname = unix.getLocalHostname()
username = unix.getUserName()

parser = createOptionParserFn()
(options, args) = parser.parse_args()

if useHelperToolLogging:
gplog.setup_helper_tool_logging(execname, hostname, username)
else:
gplog.setup_tool_logging(execname, hostname, username,
logdir=options.ensure_value("logfileDirectory", None), nonuser=nonuser)
logdir=parserOptions.ensure_value("logfileDirectory", None), nonuser=nonuser)

if forceQuiet:
gplog.quiet_stdout_logging()
else:
if options.ensure_value("verbose", False):
if parserOptions.ensure_value("verbose", False):
gplog.enable_verbose_logging()
if options.ensure_value("quiet", False):
if parserOptions.ensure_value("quiet", False):
gplog.quiet_stdout_logging()

if options.ensure_value("coordinatorDataDirectory", None) is not None:
options.coordinator_data_directory = os.path.abspath(options.coordinatorDataDirectory)

if not suppressStartupLogMessage:
logger.info("Starting %s with args: %s" % (gProgramName, ' '.join(sys.argv[1:])))

commandObject = createCommandFn(options, args)
commandObject = createCommandFn(parserOptions, parserArgs)
exitCode = commandObject.run()
exit_status = exitCode

Expand All @@ -368,10 +373,10 @@ def simple_main_locked(createOptionParserFn, createCommandFn, mainOptions):
e.cmd.results.stderr))
exit_status = 2
except Exception as e:
if options is None:
if parserOptions is None:
logger.exception("%s failed. exiting...", gProgramName)
else:
if options.ensure_value("verbose", False):
if parserOptions.ensure_value("verbose", False):
logger.exception("%s failed. exiting...", gProgramName)
else:
logger.fatal("%s failed. (Reason='%s') exiting..." % (gProgramName, e))
Expand Down
3 changes: 2 additions & 1 deletion gpMgmt/bin/gppylib/system/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def __init__(self, coordinatorDataDir, readFromCoordinatorCatalog, timeout=None,
"""
if coordinatorDataDir is None:
self.__coordinatorDataDir = gp.get_coordinatordatadir()
else: self.__coordinatorDataDir = coordinatorDataDir
else:
self.__coordinatorDataDir = coordinatorDataDir

logger.debug("Obtaining coordinator's port from coordinator data directory")
pgconf_dict = pgconf.readfile(self.__coordinatorDataDir + "/postgresql.conf")
Expand Down
13 changes: 13 additions & 0 deletions gpMgmt/doc/gpfdist_help
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ SYNOPSIS

gpfdist [-d <directory>] [-p <http_port>] [-l <log_file>] [-t <timeout>]
[-S] [-w <time>] [-v | -V] [-m <max_length>] [--ssl <certificate_path>]
[--ssl_verify_peer <on/off>]

gpfdist [-? | --help] | --version

Expand Down Expand Up @@ -139,6 +140,18 @@ OPTIONS
The root directory (/) cannot be specified as certificate_path.


--ssl_verify_peer <on/off>

This option is used to configure the SSL authentication files required in
the gpfdists protocol, as well as to control the behavior of gpfdist regarding
SSL identity authentication. The default value is 'on'.

If the flag is 'on', gpfdist will be forced to check the identity of clients,
and CA certification file (root.crt) is necessary to be laid in the SSL
certification directory. If the flag is 'off', gpfdist will not check the
identity of clients, and it doesn't matter whether the CA certification file
exists on the gpfdist side.

-v (verbose)

Verbose mode shows progress and status messages.
Expand Down
27 changes: 27 additions & 0 deletions gpMgmt/test/behave/mgmt_utils/gpexpand.feature
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,33 @@ Feature: expand the cluster by adding more segments
When the user runs gpexpand to redistribute
Then the tablespace is valid after gpexpand

@gpexpand_no_mirrors
Scenario: expand a cluster with tablespace when there is no tablespace configuration file
Given the database is not running
And a working directory of the test as '/data/gpdata/gpexpand'
And the user runs command "rm -rf /data/gpdata/gpexpand/*"
And a temporary directory under "/data/gpdata/gpexpand/expandedData" to expand into
And a cluster is created with no mirrors on "cdw" and "sdw1"
And database "gptest" exists
And a tablespace is created with data
And another tablespace is created with data
And there are no gpexpand_inputfiles
And the cluster is setup for an expansion on hosts "cdw"
And the user runs gpexpand interview to add 1 new segment and 0 new host "ignore.host"
And the number of segments have been saved
And there are no gpexpand tablespace input configuration files
When the user runs gpexpand with the latest gpexpand_inputfile without ret code check
Then gpexpand should return a return code of 1
And gpexpand should print "[WARNING]:-Could not locate tablespace input configuration file" escaped to stdout
And gpexpand should print "A new tablespace input configuration file is written to" escaped to stdout
And gpexpand should print "Please review the file and re-run with: gpexpand -i" escaped to stdout
And verify if a gpexpand tablespace input configuration file is created
When the user runs gpexpand with the latest gpexpand_inputfile with additional parameters "--silent"
And verify that the cluster has 1 new segments
And all the segments are running
When the user runs gpexpand to redistribute
Then the tablespace is valid after gpexpand

@gpexpand_verify_redistribution
Scenario: Verify data is correctly redistributed after expansion
Given the database is not running
Expand Down
69 changes: 69 additions & 0 deletions gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,75 @@ Feature: gprecoverseg tests
And all the segments are running
And the segments are synchronized

Scenario: gprecoverseg runs with given coordinator data directory option
Given the database is running
And all the segments are running
And the segments are synchronized
And user stops all mirror processes
And user can start transactions
And "COORDINATOR_DATA_DIRECTORY" environment variable is not set
Then the user runs utility "gprecoverseg" with coordinator data directory and "-F -a"
And gprecoverseg should return a return code of 0
And "COORDINATOR_DATA_DIRECTORY" environment variable should be restored
And all the segments are running
And the segments are synchronized

Scenario: gprecoverseg priorities given coordinator data directory over env option
Given the database is running
And all the segments are running
And the segments are synchronized
And user stops all mirror processes
And user can start transactions
And the environment variable "COORDINATOR_DATA_DIRECTORY" is set to "/tmp/"
Then the user runs utility "gprecoverseg" with coordinator data directory and "-F -a"
And gprecoverseg should return a return code of 0
And "COORDINATOR_DATA_DIRECTORY" environment variable should be restored
And all the segments are running
And the segments are synchronized

Scenario: gprecoverseg differential recovery displays rsync progress to the user
Given the database is running
And all the segments are running
And the segments are synchronized
And all files in gpAdminLogs directory are deleted on all hosts in the cluster
And user stops all mirror processes
When user can start transactions
And the user runs "gprecoverseg --differential -a -s"
Then gprecoverseg should return a return code of 0
And gprecoverseg should print "Initiating segment recovery. Upon completion, will start the successfully recovered segments" to stdout
And gprecoverseg should print "total size" to stdout for each mirror
And gprecoverseg should print "Segments successfully recovered" to stdout
And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts
And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts
And gpAdminLogs directory has no "rsync*" files on all segment hosts
And gpAdminLogs directory has "gpsegrecovery*" files
And gpAdminLogs directory has "gpsegsetuprecovery*" files
And all the segments are running
And the segments are synchronized
And verify replication slot internal_wal_replication_slot is available on all the segments
And check segment conf: postgresql.conf

Scenario: gprecoverseg does not display rsync progress to the user when --no-progress option is specified
Given the database is running
And all the segments are running
And the segments are synchronized
And all files in gpAdminLogs directory are deleted on all hosts in the cluster
And user stops all mirror processes
When user can start transactions
And the user runs "gprecoverseg --differential -a -s --no-progress"
Then gprecoverseg should return a return code of 0
And gprecoverseg should print "Initiating segment recovery. Upon completion, will start the successfully recovered segments" to stdout
And gprecoverseg should not print "total size is .* speedup is .*" to stdout
And gprecoverseg should print "Segments successfully recovered" to stdout
And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts
And gpAdminLogs directory has no "pg_rewind*" files on all segment hosts
And gpAdminLogs directory has no "rsync*" files on all segment hosts
And gpAdminLogs directory has "gpsegrecovery*" files
And gpAdminLogs directory has "gpsegsetuprecovery*" files
And all the segments are running
And the segments are synchronized
And check segment conf: postgresql.conf

Scenario: When gprecoverseg incremental recovery uses pg_rewind to recover and an existing postmaster.pid on the killed primary segment corresponds to a non postgres process
Given the database is running
And all the segments are running
Expand Down
26 changes: 26 additions & 0 deletions gpMgmt/test/behave/mgmt_utils/gpstart.feature
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,32 @@ Feature: gpstart behave tests
And gpstart should return a return code of 0
And all the segments are running

@demo_cluster
Scenario: gpstart runs with given coordinator data directory option
Given the database is running
And running postgres processes are saved in context
And the user runs "gpstop -a"
And gpstop should return a return code of 0
And verify no postgres process is running on all hosts
And "COORDINATOR_DATA_DIRECTORY" environment variable is not set
Then the user runs utility "gpstart" with coordinator data directory and "-a"
And gpstart should return a return code of 0
And "COORDINATOR_DATA_DIRECTORY" environment variable should be restored
And all the segments are running

@demo_cluster
Scenario: gpstart priorities given coordinator data directory over env option
Given the database is running
And running postgres processes are saved in context
And the user runs "gpstop -a"
And gpstop should return a return code of 0
And verify no postgres process is running on all hosts
And the environment variable "COORDINATOR_DATA_DIRECTORY" is set to "/tmp/"
Then the user runs utility "gpstart" with coordinator data directory and "-a"
And gpstart should return a return code of 0
And "COORDINATOR_DATA_DIRECTORY" environment variable should be restored
And all the segments are running

@concourse_cluster
@demo_cluster
Scenario: gpstart starts even if a segment host is unreachable
Expand Down
Loading
Loading