Backup and restore test #67
Workflow file for this run
---
# TODO:
# - [ ] test on XL, also break the puppetdb database
# - [ ] test custom scenario for only restoring primary
# - [ ] test custom scenario for only restoring puppetdb (on external server)
name: "Backup and restore test"
on:
  pull_request_target:
    types: [ready_for_review]
  workflow_dispatch:
    inputs:
      image:
        description: "GCP image for test cluster"
        required: true
        default: "almalinux-cloud/almalinux-8"
      architecture:
        description: "PE architecture to test"
        required: true
        default: "standard"
        type: choice
        options:
          - standard
          - standard-with-dr
          - large
          - large-with-dr
          - extra-large
          - extra-large-with-dr
      version:
        description: "PE version to install"
        required: true
        default: "2023.5.0"
      ssh-debugging:
        description: "Boolean; whether or not to pause for ssh debugging"
        required: true
        default: "false"
jobs:
  backup:
    name: "Backup cluster: PE ${{ github.event.inputs.version || '2023.5.0' }} ${{ github.event.inputs.architecture || 'extra-large' }} on ${{ github.event.inputs.image || 'almalinux-cloud/almalinux-8' }}"
    runs-on: ubuntu-20.04
    env:
      BOLT_GEM: true
      BOLT_DISABLE_ANALYTICS: true
      LANG: "en_US.UTF-8"
    steps:
      - name: "Start SSH session"
        if: ${{ github.event.inputs.ssh-debugging == 'true' }}
        uses: luchihoratiu/debug-via-ssh@main
        with:
          NGROK_AUTH_TOKEN: ${{ secrets.NGROK_AUTH_TOKEN }}
          SSH_PASS: ${{ secrets.SSH_PASS }}
      - name: "Checkout Source"
        uses: actions/checkout@v2
      - name: "Activate Ruby 2.7"
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: "2.7"
          bundler-cache: true
      - name: "Print bundle environment"
        if: ${{ github.repository_owner == 'puppetlabs' }}
        run: |
          echo ::group::info:bundler
          bundle env
          echo ::endgroup::
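      # Provision the test cluster in GCP via the provision_service provider.
      # The SSH keep-alive settings help long-running bolt connections survive
      # while the nodes are being built.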
- name: "Provision test cluster" | |
timeout-minutes: 15 | |
run: | | |
echo ::group::prepare | |
mkdir -p $HOME/.ssh | |
echo 'Host *' > $HOME/.ssh/config | |
echo ' ServerAliveInterval 150' >> $HOME/.ssh/config | |
echo ' ServerAliveCountMax 2' >> $HOME/.ssh/config | |
bundle exec rake spec_prep | |
echo ::endgroup:: | |
echo ::group::provision | |
bundle exec bolt plan run peadm_spec::provision_test_cluster \ | |
--modulepath spec/fixtures/modules \ | |
provider=provision_service \ | |
image=${{ github.event.inputs.image || 'almalinux-cloud/almalinux-8' }} \ | |
architecture=${{ github.event.inputs.architecture || 'extra-large' }} | |
echo ::endgroup:: | |
echo ::group::info:request | |
cat request.json || true; echo | |
echo ::endgroup:: | |
echo ::group::info:inventory | |
sed -e 's/password: .*/password: "[redacted]"/' < spec/fixtures/litmus_inventory.yaml || true | |
echo ::endgroup:: | |
- name: "Install PE on test cluster" | |
timeout-minutes: 120 | |
run: | | |
bundle exec bolt plan run peadm_spec::install_test_cluster \ | |
--inventoryfile spec/fixtures/litmus_inventory.yaml \ | |
--modulepath spec/fixtures/modules \ | |
architecture=${{ github.event.inputs.architecture || 'extra-large' }} \ | |
version=${{ github.event.inputs.version || '2023.5.0' }} \ | |
--stream | |
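      # Take a peadm backup of the healthy cluster. continue-on-error keeps the
      # job going so the restore test and teardown still run if the backup fails.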
      - name: Perform peadm backup of cluster
        timeout-minutes: 10
        continue-on-error: true
        run: |
          echo ::group::prepare
          mkdir -p $HOME/.ssh
          echo 'Host *' > $HOME/.ssh/config
          echo ' ServerAliveInterval 150' >> $HOME/.ssh/config
          echo ' ServerAliveCountMax 2' >> $HOME/.ssh/config
          bundle exec rake spec_prep
          echo ::endgroup::
          echo ::group::backup
          bundle exec bolt plan run peadm_spec::test_backup \
            --inventoryfile spec/fixtures/litmus_inventory.yaml \
            --modulepath spec/fixtures/modules \
            --stream
          echo ::endgroup::
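      # yq is used below to read target URIs (primary, primary-pdb-postgresql)
      # out of the litmus inventory file.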
      - name: Set up yq
        uses: frenck/action-setup-yq@v1
        with:
          version: v4.30.5
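      # Simulate a catastrophic failure: delete the CA/SSL directories and the
      # classifier database on the primary, and for extra-large architectures
      # also uninstall PE from the external PuppetDB PostgreSQL node.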
      - name: Break PE cluster
        run: |
          primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
          echo "Removing ssl directories"
          bundle exec bolt command run "rm -rf /etc/puppetlabs/puppetserver/ca /etc/puppetlabs/puppet/ssl" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
          echo "Removing classifier database"
          bundle exec bolt command run "rm -rf /opt/puppetlabs/server/data/postgresql/classifier" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
          bundle exec bolt command run "puppet infrastructure status" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
          #TODO if arch is XL, run pe-uninstaller on the primary database
          if [ ${{ github.event.inputs.architecture || 'extra-large' }} == 'extra-large' ]; then
            echo "Uninstalling PE from primary database"
            primary_db=$(yq '.groups[].targets[] | select(.vars.role == "primary-pdb-postgresql") | .uri' spec/fixtures/litmus_inventory.yaml)
            bundle exec bolt command run "/opt/puppetlabs/bin/puppet-enterprise-uninstaller -p -d -y || true" -t $primary_db \
              --inventoryfile spec/fixtures/litmus_inventory.yaml \
              --modulepath spec/fixtures/modules \
              --verbose \
              --stream
          fi
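      # Reinstall PE on the primary with the plain installer (peadm_spec::reinstall_pe),
      # i.e. not via peadm, before attempting the peadm restore.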
      - name: Reinstall PE primary in a non-peadm way
        continue-on-error: true
        run: |
          primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
          bundle exec bolt task run peadm_spec::reinstall_pe version=${{ github.event.inputs.version || '2023.5.0' }} -t $primary \
            --inventoryfile spec/fixtures/litmus_inventory.yaml \
            --modulepath spec/fixtures/modules \
            --verbose \
            --stream
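      # The systemd drop-in created below short-circuits pe-puppetdb (1s start/stop
      # timeouts, no restarts) so the restore is not held up by a PuppetDB that
      # cannot reach its deliberately broken external database; the drop-in is
      # removed again after the restore plan has run.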
      - name: Perform peadm restore of primary server
        timeout-minutes: 30
        continue-on-error: true
        run: |
          echo ::group::prepare
          mkdir -p $HOME/.ssh
          echo 'Host *' > $HOME/.ssh/config
          echo ' ServerAliveInterval 150' >> $HOME/.ssh/config
          echo ' ServerAliveCountMax 2' >> $HOME/.ssh/config
          bundle exec rake spec_prep
          echo ::endgroup::
          primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
          bundle exec bolt command run "echo '[Service]' | sudo tee /etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf; echo 'TimeoutStartSec=1' | sudo tee -a /etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf; echo 'TimeoutStopSec=1' | sudo tee -a /etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf; echo 'Restart=no' | sudo tee -a /etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf; sudo systemctl daemon-reload" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
          #TODO update the restore to:
          # - restore the puppetdb in the standard cases
          # - not restore the puppetdb if there is a broken external db
          echo ::group::restore
          bundle exec bolt plan run peadm_spec::test_restore \
            --inventoryfile spec/fixtures/litmus_inventory.yaml \
            --modulepath spec/fixtures/modules \
            --stream \
            || true # ignore errors
          echo ::endgroup::
          bundle exec bolt command run "systemctl stop pe-puppetdb.service && rm /etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf && systemctl daemon-reload" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
      # TODO: add docs to README about actions the customer needs to do to recover the primary db
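      # For extra-large architectures the external PuppetDB PostgreSQL node was
      # uninstalled above; this step is intended to rebuild it and then restore
      # the PuppetDB database (restore_type="recovery-db"). Most of it is still TODO.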
      - name: Setup Primary DB in XL
        if: ${{ (github.event.inputs.architecture || 'extra-large') == 'extra-large' }}
        run: |
          # primary_db=$(yq '.groups[].targets[] | select(.vars.role == "primary-pdb-postgresql") | .uri' spec/fixtures/litmus_inventory.yaml)
          # bundle exec bolt plan run peadm_spec::init_db_server db_host=$primary_db \
          #   --inventoryfile spec/fixtures/litmus_inventory.yaml \
          #   --modulepath spec/fixtures/modules \
          #   --stream \
          #   || true # ignore errors
          #TODO: update peadm configuration by running the plan peadm::util::update_classification
          # specifying the restored peadm config except providing "undef" for both database hosts
          # (this will initialise puppetdb on the primary)
          # should this ^^^ be part of the restore plan with recovery-db restore type?
          #TODO: run peadm::add_database on the primary db
          # after this step, we expect this to be the state:
          # - the primary (external) postgresql server is up and running
          # - puppet primary points to the external server for its puppetdb
          # restore the puppetdb database
          bundle exec bolt plan run peadm_spec::test_restore restore_type="recovery-db" \
            --inventoryfile spec/fixtures/litmus_inventory.yaml \
            --modulepath spec/fixtures/modules \
            --stream
      - name: Output PE cluster status
        run: |
          primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
          bundle exec bolt command run "puppet infrastructure status" -t $primary --inventoryfile spec/fixtures/litmus_inventory.yaml
- name: "Wait as long as the file ${HOME}/pause file is present" | |
continue-on-error: true | |
if: ${{ always() && github.event.inputs.ssh-debugging == 'true' }} | |
# if: github.event.inputs.ssh-debugging == 'true' | |
run: | | |
while [ -f "${HOME}/pause" ] ; do | |
echo "${HOME}/pause present, sleeping for 60 seconds..." | |
sleep 10 | |
done | |
echo "${HOME}/pause absent, continuing workflow." | |
- name: "Tear down cluster" | |
if: always() | |
run: | | |
if [ -f spec/fixtures/litmus_inventory.yaml ]; then | |
echo ::group::tear_down | |
bundle exec rake 'litmus:tear_down' | |
echo ::endgroup:: | |
echo ::group::info:request | |
cat request.json || true; echo | |
echo ::endgroup:: | |
fi |