Skip to content

Backup and restore test #66

Backup and restore test

Backup and restore test #66

---
# TODO:
# - [ ] test on XL, also break the puppetdb database
# - [ ] test custom scenario for only restoring primary
# - [ ] test custom scenario for only restoring puppetdb (on external server)
#
# Workflow outline: provision a PE test cluster, take a peadm backup, break
# the cluster on purpose, then restore it and print the cluster status.
name: "Backup and restore test"

on:
  # Fires when a pull request is marked as ready for review.
  pull_request_target:
    types:
      - ready_for_review

  # Manual trigger. When the workflow is started by the pull request trigger
  # instead, no inputs exist and the job falls back to its own `||` defaults
  # (note: that fallback architecture is 'extra-large', not 'standard').
  workflow_dispatch:
    inputs:
      image:
        description: "GCP image for test cluster"
        required: true
        default: "almalinux-cloud/almalinux-8"
      architecture:
        description: "PE architecture to test"
        required: true
        default: "standard"
        type: choice
        options:
          - standard
          - standard-with-dr
          - large
          - large-with-dr
          - extra-large
          - extra-large-with-dr
      version:
        description: "PE version to install"
        required: true
        default: "2023.5.0"
      # Compared against the string 'true' by the SSH and pause steps.
      ssh-debugging:
        description: "Boolean; whether or not to pause for ssh debugging"
        required: true
        default: "false"
jobs:
  # Single job: every step below runs sequentially on one runner.
  backup:
    # The `|| '...'` fallbacks supply values when the run has no
    # workflow_dispatch inputs (i.e. it was started by the pull request trigger).
    name: "Backup cluster: PE ${{ github.event.inputs.version || '2023.5.0' }} ${{ github.event.inputs.architecture || 'extra-large' }} on ${{ github.event.inputs.image || 'almalinux-cloud/almalinux-8' }}"
    # The ubuntu-20.04 hosted runner image has been retired by GitHub, so jobs
    # requesting it no longer start; use a currently supported image.
    runs-on: ubuntu-22.04
    # Environment shared by every step of the job.
    env:
      BOLT_GEM: true
      BOLT_DISABLE_ANALYTICS: true
      LANG: "en_US.UTF-8"
    steps:
# Runs the debug-via-ssh action, but only when the run was dispatched with the
# ssh-debugging input set to 'true'.
# NOTE(review): the action is referenced by the mutable `main` branch and
# receives two secrets; consider pinning it to a reviewed commit.
- name: "Start SSH session"
  uses: luchihoratiu/debug-via-ssh@main
  if: github.event.inputs.ssh-debugging == 'true'
  with:
    NGROK_AUTH_TOKEN: ${{ secrets.NGROK_AUTH_TOKEN }}
    SSH_PASS: ${{ secrets.SSH_PASS }}
# Check the repository out onto the runner.
# NOTE(review): actions/checkout@v2 is an old major version; confirm whether
# it should be upgraded.
- name: "Checkout Source"
  uses: actions/checkout@v2

# Install Ruby 2.7 with the action's bundler caching switched on, so the
# `bundle exec` calls in later steps have their gems available.
- name: "Activate Ruby 2.7"
  uses: ruby/setup-ruby@v1
  with:
    bundler-cache: true
    ruby-version: "2.7"
# Diagnostic only: prints `bundle env` inside a collapsible log group.
# Skipped unless the repository owner is puppetlabs.
- name: "Print bundle environment"
  if: github.repository_owner == 'puppetlabs'
  run: |
    echo '::group::info:bundler'
    bundle env
    echo '::endgroup::'
# Provisions the test nodes by running the peadm_spec::provision_test_cluster
# plan, then prints the provisioning request and a password-redacted copy of
# the generated inventory.
- name: "Provision test cluster"
  timeout-minutes: 15
  env:
    # Inputs are handed to the script through the environment rather than
    # being expanded into the script text, so their contents can never be
    # interpreted as shell syntax.
    PEADM_TEST_IMAGE: ${{ github.event.inputs.image || 'almalinux-cloud/almalinux-8' }}
    PEADM_TEST_ARCHITECTURE: ${{ github.event.inputs.architecture || 'extra-large' }}
  run: |
    echo ::group::prepare
    # SSH client keep-alive settings for every host.
    mkdir -p $HOME/.ssh
    echo 'Host *' > $HOME/.ssh/config
    echo ' ServerAliveInterval 150' >> $HOME/.ssh/config
    echo ' ServerAliveCountMax 2' >> $HOME/.ssh/config
    bundle exec rake spec_prep
    echo ::endgroup::
    echo ::group::provision
    bundle exec bolt plan run peadm_spec::provision_test_cluster \
      --modulepath spec/fixtures/modules \
      provider=provision_service \
      image="${PEADM_TEST_IMAGE}" \
      architecture="${PEADM_TEST_ARCHITECTURE}"
    echo ::endgroup::
    echo ::group::info:request
    # Best effort: a missing request.json must not fail the step.
    cat request.json || true; echo
    echo ::endgroup::
    echo ::group::info:inventory
    # Mask passwords before the inventory is written to the log.
    sed -e 's/password: .*/password: "[redacted]"/' < spec/fixtures/litmus_inventory.yaml || true
    echo ::endgroup::
# Installs PE on the provisioned nodes via the peadm_spec::install_test_cluster
# plan, using the inventory written by the provisioning step.
- name: "Install PE on test cluster"
  timeout-minutes: 120
  env:
    # Passed via the environment instead of being expanded into the script
    # text, so input contents cannot be interpreted as shell syntax.
    PEADM_TEST_ARCHITECTURE: ${{ github.event.inputs.architecture || 'extra-large' }}
    PEADM_TEST_VERSION: ${{ github.event.inputs.version || '2023.5.0' }}
  run: |
    bundle exec bolt plan run peadm_spec::install_test_cluster \
      --inventoryfile spec/fixtures/litmus_inventory.yaml \
      --modulepath spec/fixtures/modules \
      architecture="${PEADM_TEST_ARCHITECTURE}" \
      version="${PEADM_TEST_VERSION}" \
      --stream
# Takes the backup by running the peadm_spec::test_backup plan. A failure here
# is tolerated (continue-on-error) so the remaining steps still run.
- name: "Perform peadm backup of cluster"
  continue-on-error: true
  timeout-minutes: 10
  run: |
    echo ::group::prepare
    # (Re)write the SSH client keep-alive settings in one redirect.
    mkdir -p $HOME/.ssh
    {
      echo 'Host *'
      echo ' ServerAliveInterval 150'
      echo ' ServerAliveCountMax 2'
    } > $HOME/.ssh/config
    bundle exec rake spec_prep
    echo ::endgroup::

    echo ::group::backup
    bundle exec bolt plan run peadm_spec::test_backup \
      --inventoryfile spec/fixtures/litmus_inventory.yaml \
      --modulepath spec/fixtures/modules \
      --stream
    echo ::endgroup::
# Installs yq, which the following steps use to look up node URIs in
# spec/fixtures/litmus_inventory.yaml.
- name: "Set up yq"
  uses: frenck/action-setup-yq@v1
  with:
    version: "v4.30.5"
# Deliberately damages the cluster so the restore has something to repair:
# removes the CA/SSL directories and the classifier database directory on the
# primary and, for the extra-large architecture, uninstalls PE from the
# primary PuppetDB PostgreSQL node.
- name: Break PE cluster
  env:
    # Passed via the environment instead of being expanded into the script
    # text, so the input cannot be interpreted as shell syntax.
    PEADM_TEST_ARCHITECTURE: ${{ github.event.inputs.architecture || 'extra-large' }}
  run: |
    primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
    echo "Removing ssl directories"
    bundle exec bolt command run "rm -rf /etc/puppetlabs/puppetserver/ca /etc/puppetlabs/puppet/ssl" -t "$primary" --inventoryfile spec/fixtures/litmus_inventory.yaml
    echo "Removing classifier database"
    bundle exec bolt command run "rm -rf /opt/puppetlabs/server/data/postgresql/classifier" -t "$primary" --inventoryfile spec/fixtures/litmus_inventory.yaml
    bundle exec bolt command run "puppet infrastructure status" -t "$primary" --inventoryfile spec/fixtures/litmus_inventory.yaml
    # Only the exact 'extra-large' architecture is matched here.
    # NOTE(review): 'extra-large-with-dr' is not covered - confirm that is intended.
    if [ "${PEADM_TEST_ARCHITECTURE}" = 'extra-large' ]; then
      echo "Uninstalling PE from primary database"
      primary_db=$(yq '.groups[].targets[] | select(.vars.role == "primary-pdb-postgresql") | .uri' spec/fixtures/litmus_inventory.yaml)
      # `|| true` inside the remote command: a failing uninstaller is ignored.
      bundle exec bolt command run "/opt/puppetlabs/bin/puppet-enterprise-uninstaller -p -d -y || true" -t "$primary_db" \
        --inventoryfile spec/fixtures/litmus_inventory.yaml \
        --modulepath spec/fixtures/modules \
        --verbose \
        --stream
    fi
# Reinstalls PE on the primary with the peadm_spec::reinstall_pe task.
# A failure is tolerated (continue-on-error) so the restore is still attempted.
- name: Reinstall PE primary in a non-peadm way
  continue-on-error: true
  env:
    # Passed via the environment instead of being expanded into the script
    # text, so the input cannot be interpreted as shell syntax.
    PEADM_TEST_VERSION: ${{ github.event.inputs.version || '2023.5.0' }}
  run: |
    primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' spec/fixtures/litmus_inventory.yaml)
    bundle exec bolt task run peadm_spec::reinstall_pe version="${PEADM_TEST_VERSION}" -t "$primary" \
      --inventoryfile spec/fixtures/litmus_inventory.yaml \
      --modulepath spec/fixtures/modules \
      --verbose \
      --stream
# Restores the primary with the peadm_spec::test_restore plan.
# Around the restore, a systemd drop-in is written for pe-puppetdb that sets
# 1-second start/stop timeouts and Restart=no; afterwards the service is
# stopped and the drop-in removed. Restore errors are ignored twice over:
# `|| true` in the script and continue-on-error on the step.
- name: "Perform peadm restore of primary server"
  continue-on-error: true
  timeout-minutes: 30
  run: |
    inventory=spec/fixtures/litmus_inventory.yaml
    dropin=/etc/systemd/system/pe-puppetdb.service.d/10-shortcircuit.conf

    echo ::group::prepare
    # (Re)write the SSH client keep-alive settings in one redirect.
    mkdir -p $HOME/.ssh
    {
      echo 'Host *'
      echo ' ServerAliveInterval 150'
      echo ' ServerAliveCountMax 2'
    } > $HOME/.ssh/config
    bundle exec rake spec_prep
    echo ::endgroup::

    primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' "$inventory")

    # ${dropin} is expanded locally, so the remote command is the full literal path.
    bundle exec bolt command run "echo '[Service]' | sudo tee ${dropin}; echo 'TimeoutStartSec=1' | sudo tee -a ${dropin}; echo 'TimeoutStopSec=1' | sudo tee -a ${dropin}; echo 'Restart=no' | sudo tee -a ${dropin}; sudo systemctl daemon-reload" -t $primary --inventoryfile "$inventory"

    #TODO update the restore to:
    # - restore the puppetdb in the standard cases
    # - not restore the puppetdb if there is a broken external db
    echo ::group::restore
    bundle exec bolt plan run peadm_spec::test_restore \
      --inventoryfile "$inventory" \
      --modulepath spec/fixtures/modules \
      --stream \
      || true # ignore errors
    echo ::endgroup::

    # Undo the temporary drop-in.
    bundle exec bolt command run "systemctl stop pe-puppetdb.service && rm ${dropin} && systemctl daemon-reload" -t $primary --inventoryfile "$inventory"
# TODO: add docs to README about actions the customer needs to do to recover the primary db
#
# Runs the "recovery-db" restore, intended for the extra-large architecture only.
- name: Setup Primary DB in XL
  # The whole comparison has to sit inside ${{ }}. With only the left operand
  # inside it, the condition expanded to a non-empty string such as
  # "standard == 'extra-large'", which is always truthy, so the step ran for
  # every architecture.
  if: ${{ (github.event.inputs.architecture || 'extra-large') == 'extra-large' }}
  run: |
    # Disabled for now: initialise the external database server first.
    # primary_db=$(yq '.groups[].targets[] | select(.vars.role == "primary-pdb-postgresql") | .uri' spec/fixtures/litmus_inventory.yaml)
    # bundle exec bolt plan run peadm_spec::init_db_server db_host=$primary_db \
    # --inventoryfile spec/fixtures/litmus_inventory.yaml \
    # --modulepath spec/fixtures/modules \
    # --stream \
    # || true # ignore errors

    #TODO: update peadm configuration by running the plan peadm::util::update_classification
    # specifying the restored peadm config except providing "undef" for both database hosts
    # (this will initialise puppetdb on the primary)
    # should this ^^^ be part of the restore plan with recovery-db restore type?

    #TODO: run peadm::add_database on the primary db

    # after this step, we expect this to be the state:
    # - the primary (external) postgresql server is up and running
    # - puppet primary points to the external server for its puppetdb

    # restore the puppetdb database
    bundle exec bolt plan run peadm_spec::test_restore restore_type="recovery-db" \
      --inventoryfile spec/fixtures/litmus_inventory.yaml \
      --modulepath spec/fixtures/modules \
      --stream
# Prints `puppet infrastructure status` from the primary after the restore.
- name: "Output PE cluster status"
  run: |
    inventory=spec/fixtures/litmus_inventory.yaml
    # URI of the inventory target whose role is "primary".
    primary=$(yq '.groups[].targets[] | select(.vars.role == "primary") | .uri' "$inventory")
    bundle exec bolt command run "puppet infrastructure status" -t $primary --inventoryfile "$inventory"
# Holds the job open for SSH debugging: polls until ${HOME}/pause no longer
# exists. Only active when the ssh-debugging input is 'true'.
- name: "Wait as long as the file ${HOME}/pause is present"
  continue-on-error: true
  # always() keeps the wait available after an earlier step has failed; the
  # plain form `github.event.inputs.ssh-debugging == 'true'` would be skipped
  # in that case.
  if: ${{ always() && github.event.inputs.ssh-debugging == 'true' }}
  run: |
    while [ -f "${HOME}/pause" ] ; do
      # The message now states the real polling interval (it used to claim 60 seconds).
      echo "${HOME}/pause present, sleeping for 10 seconds..."
      sleep 10
    done
    echo "${HOME}/pause absent, continuing workflow."
# Always runs, so provisioned nodes are released even after a failure.
# Does nothing when no inventory file was ever written.
- name: "Tear down cluster"
  if: always()
  run: |
    inventory=spec/fixtures/litmus_inventory.yaml
    # Guard clause: without an inventory there is nothing to tear down.
    if [ ! -f "$inventory" ]; then
      exit 0
    fi

    echo ::group::tear_down
    bundle exec rake 'litmus:tear_down'
    echo ::endgroup::

    echo ::group::info:request
    # Best effort: a missing request.json must not fail the step.
    cat request.json || true; echo
    echo ::endgroup::