diff --git a/FSx_ONTAP_AWS_CLI_Scripts/README.md b/FSx_ONTAP_AWS_CLI_Scripts/README.md new file mode 100644 index 0000000..4ca00aa --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/README.md @@ -0,0 +1,27 @@ +# FSxN Convenience Scripts +This folder contains sample scripts that are designed to help you use FSxN from +a command line. Most of the scripts are written in Bash, intended to be run either from +a UNIX based O/S (e.g. Linux, MacOS, FreeBSD), or from a Microsoft Windows based system with a +Windows Subsystem for Linux (WSL) based Linux distribution installed. + +## Preparation +Before running the UNIX based scripts, make sure the following packages are installed: + +* jq - lightweight and flexible command-line JSON processor +* aws-cli - Command Line Environment for AWS + +## Summary of the convenience scripts + +| Script | Description | |:--------------------------|:----------------| |create_fsxn_filesystem | Creates a new FSxN file system.| |create_fsxn_svm | Creates a new storage virtual machine under the specified file system. | |create_fsxn_volume | Creates a new volume under a specified SVM. | |list_fsxn_filesystems | List all the FSxN file systems that the user has access to. | |list_fsxn_filesystems.ps1 | List all the FSxN file systems that the user has access to, written in PowerShell. | |list_fsxn_svms | List all the FSxN storage virtual machines that the user has access to. | |list_fsxn_volumes | List all the FSxN volumes that the user has access to. | |delete_fsxn_filesystem | Deletes a FSxN file system. | |delete_fsxn_svm | Deletes a FSxN storage virtual machine. | |delete_fsxn_volume | Deletes a FSxN volume. | |purge_fsxn_backups | Purges old FSxN backups. 
| diff --git a/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_filesystem b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_filesystem new file mode 100755 index 0000000..112391b --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_filesystem @@ -0,0 +1,161 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to create an FSxN filesystem. +################################################################################ + +################################################################################ +# This function just outputs the usage information and exits. +################################################################################ +usage () { +cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 + exit 1 +fi +# +# Set some defaults. +size=1024 +throughput=128 +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +securityGroupOption="" +endpointips="" +azType="MULTI_AZ_1" +# +# Process command line arguments. +while [ ! 
-z "$1" ]; do + case $(echo "$1" | tr [A-Z] [a-z]) in + -name|--name) fileSystemName="$2" + shift + ;; + -region|--region) region="$2" + shift + ;; + -size|--size) size="$2" + if ! [[ "$size" =~ '^[0-9]+$' ]]; then + echo "-size must be an integer." + usage + fi + if [ "$size" -le 1024 ]; then + usage + fi + shift + ;; + -subnetid1|--subnetid1) subnetID1="$2" + shift + ;; + -subnetid2|--subnetid2) subnetID2="$2" + shift + ;; + -security-group-id|--security-group-id) securityGroupOption="--security-group-ids $2" + shift + ;; + -type|--type) + if [ "$(echo $2 | tr [A-Z] [a-z])" == "single" ]; then + azType="SINGLE_AZ_1" + elif [ "$(echo $2 | tr [A-Z] [a-z])" == "multi" ]; then + azType="MULTI_AZ_1" + else + echo "Error, known availability type '$2'." + usage + fi + shift + ;; + -throughput|--throughput) throughput="$2" + if ! [[ "$throughput" =~ '^[0-9]+$' ]]; then + echo "-throughput must be an integer." + usage + fi + if [ "$througput" != "128" -a "$througput" != "256" -a "$throughput" != "512" -a "$throughput" != "1024" -a "$throughput" != "2048" -a "$throughput" != "4096" ]; then + echo "-throughput must be 128 or 256 or 512 or 1024 or 2048 or 4096." + usage + fi + shift + ;; + -endpointiprange|--endpointiprange) + endpointips='"EndpointIpAddressRange": "'$2'",' + shift + ;; + -h|-help|--help) + usage + ;; + *) echo "Error, unknown option $1." 1>&2 + usage + ;; + esac + shift +done +# +# Ensure all the required parameters have been provided. +if [ -z "$fileSystemName" -o -z "$subnetID1" -o "$azType" == "MULTI_AZ_1" -a -z "$subnetID2" ]; then + echo "Missing arguments." 1>&2 + usage + exit 1 +fi +if [ $azType == "SINGLE_AZ_1" ]; then + if [ ! -z "$endpointips" ]; then + echo "Error, you can not specify Endpoint IP address range when deploying in a single availability zone." 1>&2 + exit 1 + fi + + if [ ! -z "$subnetID2" ]; then + echo "Error, you can't specify a second subnet with deploying in a single availability zone." 
1>&2 + exit 1 + fi +fi + +aws fsx create-file-system --output=json --file-system-type ONTAP --storage-capacity $size --subnet-ids $subnetID1 $subnetID2 --storage-type SSD --tags "Key=Name,Value=$fileSystemName" $securityGroupOption --ontap-configuration '{ + "PreferredSubnetId": "'$subnetID1'", + '$endpointips' + "DeploymentType": "'$azType'", + "ThroughputCapacity": '$throughput'}' --region=$region > $tmpout 2>&1 + +if [ $? != "0" ]; then + echo "Failed to create FSxN file system." 1>&2 + cat $tmpout 1>&2 + exit 1 +else + status=$(jq -r .FileSystem.Lifecycle $tmpout 2> /dev/null) + if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then + echo "File system '$fileSystemName' ($(jq -r .FileSystem.FileSystemId $tmpout)) is being created." + exit 0 + else + echo "Unknown status '$status'. Complete output returned from the AWS api:" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_svm b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_svm new file mode 100755 index 0000000..6677fee --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_svm @@ -0,0 +1,104 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+################################################################################ +# +# This script is used to create an FSxN virtual storage machine under the +# specified FSxN "filesystem". +################################################################################ + +################################################################################ +# This function just outputs the usage information and exits. +################################################################################ +usage () { +cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 + exit 1 +fi +# +# Set some defaults. +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +# +# Process command line arguments. +while getopts "hn:r:f:i:" option; do + case $option in + n) svmName=$OPTARG + ;; + r) region=$OPTARG + ;; + f) fileSystemName=$OPTARG + ;; + i) fsid=$OPTARG + ;; + *) usage + ;; + esac +done + +if [ ! -z "$fileSystemName" -a ! -z "$fsid" ]; then + echo "Error, you can only specify the -i OR the -f option. Not both." 1>&2 + usage +fi +# +# Ensure all the required parameters have been provided. +if [ -z "$svmName" -o -z "$fileSystemName" -a -z "$fsid" ]; then + echo "Error, missing required arguments." 1>&2 + usage +fi +# +# Get the file system id from the file system name. +if [ -z "$fsid" ]; then + fsid=$(aws fsx describe-file-systems --output=json 2> /dev/null | jq -r ".FileSystems[] | if((.Tags[] | select(.Key == \"Name\") .Value) == \"${fileSystemName}\") then .FileSystemId else empty end" 2> /dev/null) +fi + +if [ -z "$fsid" ]; then + echo "Error, could not find the file system with name '$fileSystemName}' in region $region." 1>&2 + exit 1 +fi +# +# Create the SVM +aws fsx create-storage-virtual-machine --name $svmName --region=$region --file-system-id $fsid --output=json > $tmpout 2>&1 + +if [ $? != "0" ]; then + echo "Failed to create storage virtual machine." 
1>&2 + cat $tmpout 1>&2 + exit 1 +else + status=$(jq -r .StorageVirtualMachine.Lifecycle $tmpout 2> /dev/null) + if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then + echo "Stroage Virtaul Machine '$svmName'($(jq -r '.StorageVirtualMachine.StorageVirtualMachineId' $tmpout)) is being created." + exit 0 + else + echo "Unknown status '$status'. Complete output returned from the AWS api:" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_volume b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_volume new file mode 100755 index 0000000..502dfcf --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/create_fsxn_volume @@ -0,0 +1,90 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to create an FSxN volume under the specified SVM. The +# FSxN "filesystem" is implied by the SVM ID. +################################################################################ + +################################################################################ +# This function just outputs the usage information and exits. 
+################################################################################ +usage () { +cat 1>&2 <&2 + usage +fi + +aws fsx create-volume --volume-type ONTAP --name $volumeName --ontap-configuration "{ + \"JunctionPath\": \"/$volumeName\", + \"SecurityStyle\": \"UNIX\", + \"SizeInMegabytes\" : $size, + \"StorageEfficiencyEnabled\": true, + \"StorageVirtualMachineId\": \"$svmId\", + \"TieringPolicy\" : {\"CoolingPeriod\": 31, \"Name\": \"SNAPSHOT_ONLY\"}, + \"OntapVolumeType\": \"RW\", + \"SnapshotPolicy\": \"default\"}" --region=$region --output=json > $tmpout 2>&1 + +if [ $? != "0" ]; then + echo "Failed to create the FSxN volume." 1>&2 + cat $tmpout 1>&2 + exit 1 +else + status=$(jq -r .Volume.Lifecycle $tmpout 2> /dev/null) + if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then + echo "FSxN volume '$volumeName'($(jq -r .Volume.VolumeId $tmpout)) is being created." + exit 0 + else + echo "Unknown status '$status'. Complete output returned from the AWS api:" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_filesystem b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_filesystem new file mode 100755 index 0000000..0494269 --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_filesystem @@ -0,0 +1,360 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to delete a FSxN filesystem. +################################################################################ + +################################################################################ +# This function just outputs the usage information and forces the script to +# exit. +################################################################################ +usage () { +cat 1>&2 < $tmpout 2>&1 + if [ $? != "0" ]; then + printf "\nError, failed to delete a volume with volumeId: '$volumeId'.\n" 1>&2 + cat $tmpout 1>&2 + return 1 + fi + # + # Wait for the volume to be deleted. + i=0 + while [ $i -lt $MaxIterations ]; do + aws fsx describe-volumes --volume-ids $volumeId --output=json --region=$region > $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r .Volumes[0].Lifecycle $tmpout 2> /dev/null) + if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to delete volume with volume ID '$volumeId'. Status = ${status}.\n" 1>&2 + cat $tmpout 1>&2 + return 1 + fi + else + # Assume if it failed, it is because the volume was deleted and doesn't exist anymore. + break + fi + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + echo "Failed to delete volume with volume ID of '$volumeId'. Taking too long." 
1>&2 + return 1 + fi + return 0 +} + +################################################################################ +# This function is used to delete an FSxN SVM. It waits for the SVM to be +# deleted. It assumes the SVM has been deleted when the API call to display +# its status returns an error. +################################################################################ +delete_svm() { + + local tmpout=/tmp/delete_fsxn_delete_svm.$BASHPID + trap 'rm -f $tmpout' RETURN + + local svmId=$1 + aws fsx delete-storage-virtual-machine --region=$region --output=json --storage-virtual-machine-id $svmId > $tmpout 2>&1 + if [ $? != "0" ]; then + printf "\nError, failed to delete a SVM with svmID: '$svmId'.\n" 1>&2 + cat $tmpout 1>&2 + return 1 + fi + # + # Wait for the svm to be deleted. + i=0 + while [ $i -lt $MaxIterations ]; do + aws fsx describe-storage-virtual-machines --storage-virtual-machine-ids $svmId --output=json --region=$region > $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r '.StorageVirtualMachines[0].Lifecycle' $tmpout 2> /dev/null) + if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to delete SVM with SVM ID '$svmId'. Status = $status\n" 1>&2 + cat $tmpout 1>&2 + return 1 + fi + else + # Assume if it failed, it is because the SVM was delted and therefore doesn't exist anymore. + break + fi + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + printf "\nFailed to delete SVM with SVM ID of '$svmID'. Taking too long.\n" 1>&2 + return 1 + fi + return 0 +} + +################################################################################ +# Main logic starts here. 
+################################################################################ +tmpout=/tmp/fsx_fs_delete.$$ +svmsFile=/tmp/fsx_fs_delete_svms.$$ +volumesFile=/tmp/fsx_fs_delete_volumes.$$ +trap 'rm -f $tmpout $svmsFile $volumesFile' exit +# +# Set the maximum number of times to check that a volume and/or SVM has been +# deleted. Multiple it by the SleepTime set below to the total amount of +# time allowed. Note, it takes at least 4 minutes to delete a volume. +MaxIterations=120 +# +# Set the number of seconds to wait between checks that a volume and/or SVM has been deleted. +SleepTime=5 +# +# Set the maximum number of "volume deletes" that can be running at the same time. +MaxDeletesRunning=20 +# +# Check that the required commands are available. +if which jq aws > /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands is required to run this script." 1>&2 + exit 1 +fi +# +# Get the default region. +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +skipBackup=true +# +# Process command line arguments. +while getopts "hbr:f:i:" option; do + case $option in + f) fileSystemName=$OPTARG + ;; + r) region=$OPTARG + ;; + i) fsid=$OPTARG + ;; + b) skipBackup=false + ;; + *) usage + ;; + esac +done + +if [ ! -z "$fsid" -a ! -z "$fileSystemName" ]; then + echo "Error, you can only specify the -i OR the -f option, not both." 1>&2 + usage # implied exit +fi +# +# Ensure all the required parameters have been provided. +if [ -z "$fileSystemName" -a -z "$fsid" ]; then + echo "Error, missing required arguments." 1>&2 + usage # implied exit +fi +# +# Get the file system id based on the name. 
+if [ -z "$fsid" ]; then + fsid=($(aws fsx describe-file-systems --region=$region --output=json 2> $tmpout | jq -r '.FileSystems[] | if((.Tags[] | select(.Key == "Name") .Value) == "'"${fileSystemName}"'") then .FileSystemId else empty end' 2> /dev/null)) + + if [ ${#fsid[*]} -gt 1 ]; then + echo "Error, more than one file system matched the file system name '$fileSystemName'." 1>&2 + echo "Please use the -i option to specify the exact file system you want to delete." 1>&2 + exit 1 + fi + + if [ -z "$fsid" ]; then + echo "Error, could not find the file system with name '$fileSystemName'." 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +else + # + # Get the file system name based on the fsid. + fileSystemName=$(aws fsx describe-file-systems --file-system-ids $fsid --region=$region --output=json 2> /dev/null | jq -r '.FileSystems[0].Tags[] | select(.Key == "Name") .Value' 2> /dev/null) + if [ -z "$fileSystemName" ]; then + echo "Error, failed to get the file system name based on the ID ($fsid)." 1>&2 + exit 1 + fi +fi +# +# Create a JSON file with all the FSxN SVMs in the region. +aws fsx describe-storage-virtual-machines --region=$region --output=json > $svmsFile 2>&1 +if [ $? -ne 0 ]; then + echo "Error, failed to get the list of SVMs." 1>&2 + cat $svmsFile 1>&2 + exit 1 +fi +# +# Create a JSON file with all the FSXN volumes in the region. +aws fsx describe-volumes --region=$region --output=json > $volumesFile 2>&1 +if [ $? -ne 0 ]; then + echo "Error, failed to get the list of volumes." 1>&2 + cat $volumesFile 1>&2 + exit 1 +fi +# +# Make sure the user really wants to delete the file system. +echo "Here are the current contents of the '$fileSystemName'($fsid) file system you have indicated you want to delete:" +displayFileSystemContents $fsid +read -p "Are you sure you want to delete this file system, with all the above volumes (yes/no)? " response +if [ "$response" != "yes" ]; then + echo "Aborted." 
+ exit 1 +fi +# +# Before you can delete a file system, you have to first delete all the volumes, +# and then all the SVMs. So, first get the list of SVMs: +declare -a svms +declare -a volumes +svms=($(jq -r '.StorageVirtualMachines[] | if(.FileSystemId == "'$fsid'") then .StorageVirtualMachineId else empty end' $svmsFile)) +# +# Now delete all the volumes for each SVM. I could just deleted all the volumes +# associated with the fsid, but I wanted the extra check on the volumeId to be +# associated with one of the SVMs that is associated with the fsid. +for svmId in ${svms[*]}; do + # + # Create an array with all the non-root volume IDs for this SVM. + volumes=($(jq -r '.Volumes[] | if(.OntapConfiguration.StorageVirtualMachineId == "'$svmId'" and (.OntapConfiguration.StorageVirtualMachineRoot | not) and .FileSystemId == "'$fsid'") then .VolumeId else empty end' $volumesFile)) + if [ ! -z "${volumes[*]}" ]; then + # + # Since it can take a while for a single volume to be deleted (e.g. 4 minutes + # for a small empty volume) and you can do multiple deletes in parallel, + # spawn them in the background and wait for them to finish. Although, since + # we don't want to overwhelm either AWS or ONTAP, only allow a certain + # number at a time. + i=0 + numRunning=0 + numVolumes=${#volumes[*]} + maxNumRunning=1 # Only do one initially, if it completes successfully, then do the rest concurrently. + printf "\nDeleting all the volumes associated with ${svmId}.\n" + while [ $i -lt $numVolumes ]; do + delete_volume ${volumes[$i]} $skipBackup & + let i+=1 + let numRunning+=1 + printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). " + if [ $numRunning -ge $maxNumRunning ]; then + # + # Wait for a job to complete. + wait -n + rc=$? + if [ $rc -eq 127 ]; then + # + # 127 means there were no background jobs. Since we just deployed one, that shouldn't happen. 
+ printf "\nError, got an expected response from 'wait'. Aborting.\n" 1>&2 + exit 1 + fi + if [ $rc -ne 0 ]; then + printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 + exit 1 + fi + let numRunning-=1 + if [ $i -eq 1 ]; then + # The first one succeeded, open up the flood gates. + maxNumRunning=$MaxDeletesRunning + fi + fi + done + # + # Now that we have queued them all up, wait for them to finish. + wait -n + rc=$? + let numRunning-=1 + while [ "$rc" != 127 ]; do + printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). " + if [ "$rc" != 0 ]; then + printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 + exit 1 + fi + wait -n + rc=$? + let numRunning-=1 + done + fi +done # for svmId in ${svms[*]}; do +# +# Now that all the volumes are deleted, delete the SVMs. +# Since there can only be 24 SVMs, don't really have to worry about spawning +# too many at a time. +[ ${#svms[*]} -gt 0 ] && printf "\nDeleting SVMs.\n" +for svmId in ${svms[*]}; do + delete_svm $svmId & +done +# +# Now wait for them to finish. +if [ ! -z "$svms" ]; then + numRunning=${#svms[*]} + printf "\rTotal number of SVMs to delete: ${#svms[*]}. Number of deletes currently running: ${numRunning}. Number waiting to be started: 0. " + wait -n + rs=$? + let numRunning-=1 + while [ "$rs" != 127 ]; do + if [ "$rs" != 0 ]; then + printf "\nError, one of the SVM deletes failed. Aborting!\n" 1>&2 + exit 1 + fi + printf "\rTotal number of SVMs to delete: ${#svms[*]}. Number of deletes currently running: ${numRunning}. Number waiting to be started: 0. " + wait -n + rs=$? + let numRunning-=1 + done +fi +# +# Now that all the volumes and all the SVMs have been deleted, we can delete the filesystem. +aws fsx delete-file-system --file-system-id $fsid --output=json --region=$region > $tmpout 2>&1 +if [ $? 
!= "0" ]; then + printf "\nError, failed to delete file system.\n" 1>&2 + cat $tmpout 1>&2 + exit 1 +else + status=$(jq -r .Lifecycle $tmpout) + if [ "$status" == "DELETING" -o "$status" == "PENDING" ]; then + printf "\nFile system '$fileSystemName' is being deleted.\n" + exit 0 + else + printf "\nUnknown status '$status'. Complete output returned from the AWS api:\n" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_svm b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_svm new file mode 100755 index 0000000..3c9707a --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_svm @@ -0,0 +1,85 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to delete a storage virtual machine from a +# FSxN filesystem. +################################################################################ + +################################################################################ +# This function just outputs the usage information and exits. 
+################################################################################ +usage () { +cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 + exit 1 +fi +# +# Set any defaults. +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +# +# Process command line arguments. +while getopts "hi:r:" option; do + case $option in + i) svmID=$OPTARG + ;; + r) region=$OPTARG + ;; + *) usage + ;; + esac + shift +done +# +# Ensure all the required parameters have been provided. +if [ -z "$svmID" ]; then + echo "Error, missing reuqired arguments." 1>&2 + usage + exit 1 +fi + +aws fsx delete-storage-virtual-machine --region=$region --storage-virtual-machine-id $svmID > $tmpout 2>&1 + +if [ $? != "0" ]; then + echo "Failed to delete storage virtual machine." 1>&2 + cat $tmpout + exit 1 +else + status=$(jq -r .Lifecycle $tmpout) + if [ "$status" == "DELETING" ]; then + echo "Storage Virtual Machine with an id of '$svmID' is being deleted." + exit 0 + else + echo "Unknown status '$status'. Complete output returned from the AWS api:" + cat $tmpout + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_volume b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_volume new file mode 100755 index 0000000..fbb7862 --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/delete_fsxn_volume @@ -0,0 +1,78 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to delete an FSxN volume. +################################################################################ + +################################################################################ +# This function just outputs the usage information and exits. +################################################################################ +usage () { +cat 1>&2 <&2 + usage +fi + +aws fsx delete-volume --volume-id $volumeId --region=$region --output=json --ontap-configuration '{"SkipFinalBackup": '$skipBackup'}' > $tmpout 2>&1 + +if [ $? != "0" ]; then + echo "Failed to delete volume." 1>&2 + cat $tmpout 1>&2 + exit 1 +else + status=$(jq -r .Lifecycle $tmpout 2> /dev/null) + if [ "$status" == "DELETING" -o "$status" == "PENDING" ]; then + echo "Volume '$volumeId' is being deleted." + exit 0 + else + echo "Unknown status '$status'. 
Complete output returned from the AWS api:" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi +fi diff --git a/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems new file mode 100755 index 0000000..412f97a --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems @@ -0,0 +1,265 @@ +#!/bin/bash +# +################################################################################# +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +################################################################################ +# This script will list all the AWS FSxN file systems the user has access to. +# It will list: +# o Region +# o File System ID +# o File Systesm "name" +# o Status +# o Management IP +# o VPC ID - optional +# o Subnet ID - optional +# o ARN - optional +# o Backup - The Backup retention period. +# +# In the case of the Management IP and Subnet ID, it will only show the first +# one defined. Based on the potential output from the API call, there could +# be more than one. 
+# +# If the '-c' option is provided, instead of providing the information above +# it will display a hierarchical view of each file system, meaning it will +# display all the SVMs, and under each SVM, all the volumes that are under it. +################################################################################ + +################################################################################ +# This function is used to output the usage information for the script and exit. +################################################################################ +usage () { + cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 + exit 1 +fi +# +# Set defaults. +allRegions=false +includeExtraInfo=false +contents=false +showStatus=false +showARN=false +showBackup=false +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +# +# Process command line arguments. +while getopts "bhcxsanr:i:f:" option; do + case "$option" in + r) region="$OPTARG" + ;; + a) allRegions=true + ;; + b) showBackup=true + ;; + x) includeExtraInfo=true + ;; + c) contents=true + ;; + i) fsid=$OPTARG + ;; + f) fileSystemName="$OPTARG" + ;; + s) showStatus=true + ;; + n) showARN=true + ;; + *) usage + ;; + esac +done +shift $((OPTIND-1)) +# +# Check for invalid options. +if [ "$showBackup" == "true" -a \( "$contents" == "true" -o "$includeExtraInfo" == "true" \) ]; then + echo "Error, the -b option is not compatiable with the -c or -x options." 1>&2 + echo "" + usage +fi + +if [ "$showARN" == "true" -a \( "$contents" == "true" -o "$includeExtraInfo" == "true" \) ]; then + echo "Error, the -n option is not compatiable with the -c or -x options." 1>&2 + echo "" + usage +fi + +if [ "$showStatus" == "true" -a "$contents" != "true" ]; then + echo "Error, the -s option is only compatiable with the -c option." 
1>&2 + echo "" + usage +fi + +declare -a regions +if [ "$allRegions" == "true" ]; then + # + # Generate a list of all the valid regions the user can search. That is the + # intersection of all the regions they have enabled, and the regions that + # support FSxN + allEndabledRegions=$(aws ec2 describe-regions --query "Regions[].RegionName" --output=json | jq -r '.[]') + allFsxnRegions=$(curl -s https://api.regional-table.region-services.aws.a2z.com/index.json | jq -r '.prices[] | select(.attributes."aws:serviceName" == "Amazon FSx for NetApp ONTAP") .attributes."aws:region"') + for reg in $allEndabledRegions; do + for fsxnReg in $allFsxnRegions; do + if [ $reg == $fsxnReg ]; then + regions+=($reg) + fi + done + done + if [ -z "$regions" ]; then + echo "Error, failed to get the list of regions that support FSxN" 1>&2 + exit 1 + fi +else + regions=($region) +fi + +if [ ! -z "$fsid" -a ! -z "$fileSystemName" ]; then + echo "Error, you can't specify both a file system ID and file system name." 1>&2 + exit 1 +fi +# +# Get the regions that support the FSxN for ONTAP service. +# +# Define the query string that is used to get only the fields needed to generate the output. +# It also makes the 'jq' commands below easily, since it flattens the JSON structure. +queryString="FileSystems[*].{ + FileSystemId: FileSystemId, + Lifecycle:Lifecycle, + Name:Tags[?Key=='Name']|[0].Value, + ManagementIp: OntapConfiguration.Endpoints.Management.IpAddresses[0], + VpcId: VpcId, + SubnetId: SubnetIds[0], + ResourceARN: ResourceARN, + AutomaticBackupRetentionDays: OntapConfiguration.AutomaticBackupRetentionDays, + DeploymentType: OntapConfiguration.DeploymentType, + DiskIopsConfiguration: OntapConfiguration.DiskIopsConfiguration.Iops, + ThroughputCapacity: OntapConfiguration.ThroughputCapacity, + StorageCapacity: StorageCapacity +}" +# +# Loop on all the requested regions. +for region in ${regions[*]}; do + if [ ! 
-z "$fileSystemName" ]; then + fsid=$(aws fsx describe-file-systems --region=$region --output=json 2> /dev/null | jq -r '.FileSystems[] | if((.Tags[] | select(.Key == "Name") .Value) == "'"${fileSystemName}"'") then .FileSystemId else empty end' 2> /dev/null) + if [ ! -z "$fsid" ]; then + aws fsx describe-file-systems --file-system-ids $fsid --region=$region --query "$queryString" --output=json > $fileSystemsFile 2>&1 + else + echo "Error, failed to get the file system ID based on a file system name of '$fileSystemName'." 1>&2 + exit 1 + fi + else + if [ -z "$fsid" ]; then + aws fsx describe-file-systems --region=$region --query "$queryString" --output=json > $fileSystemsFile 2>&1 + else + aws fsx describe-file-systems --file-system-ids $fsid --region=$region --query "$queryString" --output=json > $fileSystemsFile 2>&1 + fi + fi + + if [ $? -ne 0 ]; then + echo "Error, failed to get the list of file systems." 1>&2 + cat $fileSystemsFile 1>&2 + exit 1 + fi + + if [ $contents == "true" ]; then + aws fsx describe-storage-virtual-machines --region=$region --output=json > $svmsFile 2>&1 + if [ $? -ne 0 ]; then + echo "Error, failed to get the list of SVMs." 1>&2 + cat $svmsFile 1>&2 + exit 1 + fi + + aws fsx describe-volumes --region=$region --output=json > $volumesFile 2>&1 + if [ $? -ne 0 ]; then + echo "Error, failed to get the list of volumes." 
1>&2 + cat $volumesFile 1>&2 + exit 1 + fi + + printf "$region\n" + jq -r '.[] | .FileSystemId + " " + .Lifecycle + " =" + .Name + "="' $fileSystemsFile | while read fs fsStatus fsName; do + x="${fsName#=}" + fsName="${x%=}" + [ "$showStatus" == "true" ] && printf "\t$fs($fsStatus) - '$fsName'\n" + [ "$showStatus" != "true" ] && printf "\t$fs - '$fsName'\n" + jq -r '.StorageVirtualMachines[] | if(.FileSystemId == "'$fs'") then .StorageVirtualMachineId + " " + .Lifecycle + " " + .Name else empty end' $svmsFile | while read svm svmStatus svmName; do + [ "$showStatus" == "true" ] && printf "\t\t$svm($svmStatus) - '$svmName'\n" + [ "$showStatus" != "true" ] && printf "\t\t$svm - '$svmName'\n" + jq -r '.Volumes[] | if(.FileSystemId == "'$fs'" and .OntapConfiguration.StorageVirtualMachineId == "'$svm'") then .VolumeId + " " + .Lifecycle + " " + .Name else empty end' $volumesFile | while read volume volStatus volumeName; do + [ "$showStatus" == "true" ] && printf "\t\t\t$volume($volStatus) - '$volumeName'\n" + [ "$showStatus" != "true" ] && printf "\t\t\t$volume - '$volumeName'\n" + done + done + done + else + # + # Convert JSON into a CSV format. + # 1 = fsid + # 2 = arn + # 3 = name + # 4 = lifecycle + # 5 = management ip + # 6 = vpc id + # 7 = subnet id + # 8 = backup retention + # 9 = deployment type + # 10 = iops + # 11 = throughput + # 12 = size + if [ ! 
-z "$1" ]; then + #set -x + jq -r '.[] | .FileSystemId + "," + .ResourceARN + "," + (if((.Name | tostring) | test("'$1'")) then .Name else empty end) + "," + .Lifecycle + "," + .ManagementIp + "," + .VpcId + "," + .SubnetId + "," + (if(.AutomaticBackupRetentionDays == null) then "Dissabled" else (.AutomaticBackupRetentionDays | tostring) end) + "," + .DeploymentType + "," + (.DiskIopsConfiguration | tostring) + "," + (.ThroughputCapacity | tostring) + "," + (.StorageCapacity | tostring)' $fileSystemsFile > $tmpout + else + jq -r '.[] | .FileSystemId + "," + .ResourceARN + "," + .Name + "," + .Lifecycle + "," + .ManagementIp + "," + .VpcId + "," + .SubnetId + "," + if(.AutomaticBackupRetentionDays == null) then "Dissabled" else (.AutomaticBackupRetentionDays | tostring) end + "," + .DeploymentType + "," + (.DiskIopsConfiguration | tostring) + "," + (.ThroughputCapacity | tostring) + "," + (.StorageCapacity | tostring)' $fileSystemsFile > $tmpout + fi + + if [ "$includeExtraInfo" == "true" ]; then + awk -F, -v region=$region 'BEGIN {first=1; formatStr="%12s %23s %35s %10s %15s %22s %25s %6s %12s %11s %6s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "Name", "Status", "Management IP", "VPC ID", "Subnet ID", "Size", "Deployment", "Throughput", "Iops"; first=0}; printf formatStr, region, $1, "\"" $3 "\"", $4, $5, $6, $7, $(12), $9, $(11), $(10)}' < $tmpout + else + if [ "$showARN" == "true" ]; then + awk -F, -v region=$region 'BEGIN {first=1; formatStr="%12s %23s %70s %35s %10s %15s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "ARN", "Name", "Status", "Management IP"; first=0}; printf formatStr, region, $1, $2, "\"" $3 "\"", $4, $5}' < $tmpout + else + formatStr='%12s %23s %35s %10s %15s\n' + [ "$showBackup" == "true" ] && formatStr='%12s %23s %35s %10s %15s %17s\n' + awk -F, -v region=$region 'BEGIN {first=1; formatStr="'"${formatStr}"'"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", 
"Name", "Status", "Management IP", "Backup Retention"; first=0}; printf formatStr, region, $1, "\"" $3 "\"", $4, $5, $8}' < $tmpout + fi + fi + fi +done diff --git a/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems.ps1 b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems.ps1 new file mode 100644 index 0000000..fd03c0e --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_filesystems.ps1 @@ -0,0 +1,71 @@ +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +# This script is used to list all the FSxN File Systems a user has access to. +# It accepts two optional parameters: +# -region +# -all +# Where: +# is the region you want to list the file systems from. By default it +# list all the file systems from the default region set by the +# "aws configuration" command. +# -all means you want to list the file systems from all the known AWS regions. +# -network means you want the network informaiton associated with the filesystem. 
+# +################################################################################## + +param ([switch]$all, [string]$region, [switch]$network) + +if($region -eq "") { + $region=(get-content ~/.aws/config | select-string "region") -replace "region = ","" +} + +if ($all.IsPresent) { + $regions=(aws ec2 describe-regions --query "Regions[].RegionName" --output=json | ConvertFrom-Json) +} else { + $regions=@($region) +} + +foreach ($region in $regions) { + $first=$true + $fss=(aws fsx describe-file-systems --region=$region --output=json | ConvertFrom-Json) + + foreach ($fs in $fss.FileSystems) { + if($first) { + #%12s %23s %35s %10s %15s %21s %24s + if ($network.IsPresent) { + "`n{0,12} {1,23} {2,35} {3,10} {4, 15} {5, 21} {6, 24}" -f "Region", "File System ID", "Name", "Status", "Management IP", "VPC ID", "Subnet ID" + } else { + "`n{0,12} {1,23} {2,35} {3,10} {4, 15}" -f "Region", "File System ID", "Name", "Status", "Management IP" + } + $first=$false + } + + $name="N/A" + foreach ($tag in $fs.tags) { + if($tag.Key -eq "Name") { + $name=$tag.Value + } + } + if ($null -ne $fs.OntapConfiguration.Endpoints.Management.IpAddresses) { + $manIP = $fs.OntapConfiguration.Endpoints.Management.IpAddresses[0] + } else { + $manIP = "N/A" + } + if ($network.IsPresent) { + "{0,12} {1,23} {2,35} {3,10} {4, 15} {5, 21} {6, 24}" -f $region, $fs.FileSystemId, $name, $fs.Lifecycle, $manIP, $fs.VpcId, $fs.SubnetIds[0] + } else { + "{0,12} {1,23} {2,35} {3,10} {4, 15}" -f $region, $fs.FileSystemId, $name, $fs.Lifecycle, $manIP + } + } +} diff --git a/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_svms b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_svms new file mode 100755 index 0000000..26b7d64 --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_svms @@ -0,0 +1,121 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +################################################################################ +# This script will list all the AWS FSxN storage virtual machines (SVMs) that +# the user has access to. It will list: +# o Region +# o File system ID +# o File System Name - optional +# o SVM ID +# o SVM Name +################################################################################ + +################################################################################ +# This function outputs the usage information and exists. +################################################################################ +usage () { + cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 + exit 1 +fi +# +# Process command line arguments. +allRegions=false +includeFsName=false +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +while getopts "hanr:i:f:" option; do + case "$option" in + r) region="$OPTARG" + ;; + a) allRegions=true + ;; + n) includeFsName=true + ;; + i) fileSystemID="$OPTARG" + ;; + f) fileSystemName="$OPTARG" + ;; + *) usage + ;; + esac +done + +if [ ! -z "$fileSystemID" -a ! -z "$fileSystemName" ]; then + echo "Error, you can't specify both a file system ID and a file system name." 
1>&2 + exit 1 +fi + +if [ "$allRegions" = "true" ]; then + # + # Generate a list of all the valid regions the user can search. That is the + # intersection of all the regions they have enabled, and the regions that + # support FSxN + allEndabledRegions=$(aws ec2 describe-regions --query "Regions[].RegionName" --output=json | jq -r '.[]') + allFsxnRegions=$(curl -s https://api.regional-table.region-services.aws.a2z.com/index.json | jq -r '.prices[] | select(.attributes."aws:serviceName" == "Amazon FSx for NetApp ONTAP") .attributes."aws:region"') + for reg in $allEndabledRegions; do + for fsxnReg in $allFsxnRegions; do + if [ $reg == $fsxnReg ]; then + regions+=($reg) + fi + done + done + if [ -z "$regions" ]; then + echo "Error, failed to get the list of regions that support FSxN" 1>&2 + exit 1 + fi +else + regions=($region) +fi + +if [ ! -z "$fileSystemName" ]; then + fileSystemID=$(aws fsx describe-file-systems --output=json 2> /dev/null | jq -r '.FileSystems[] | if((.Tags[] | select(.Key == "Name") .Value) == "'"${fileSystemName}"'") then .FileSystemId else empty end' 2> /dev/null) +fi +# +# Loop on all the regions. 
+for region in ${regions[*]}; do + if [ -z "$fileSystemID" ]; then + aws fsx describe-storage-virtual-machines --region=$region | jq -r '.StorageVirtualMachines[] | .FileSystemId + "," + .StorageVirtualMachineId + "," + .Name' | sort > $tmpout + else + aws fsx describe-storage-virtual-machines --region=$region | jq -r '.StorageVirtualMachines[] | if(.FileSystemId == "'$fileSystemID'") then .FileSystemId + "," + .StorageVirtualMachineId + "," + .Name else empty end' | sort > $tmpout + fi + + if [ $includeFsName == "true" ]; then + aws fsx describe-file-systems --region=$region | jq -r '.FileSystems[] | .FileSystemId + "," + (.Tags[] | select(.Key == "Name") .Value)' > $tmpout2 + awk -F, -v region=$region 'BEGIN {first=1; maxNameLen=0; while(getline < "'$tmpout2'") {fss[$1]=$2; if(length($2) > maxNameLen) {maxNameLen=length($2)}}; maxNameLen +=2; formatStr="%12s %20s%-"maxNameLen"s %23s %s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "(Name)", "SVM ID", "SVM Name"; first=0}; name="("fss[$1]")"; printf formatStr, region, $1, name, $2, $3}' < $tmpout + else + awk -F, -v region=$region 'BEGIN {first=1; formatStr="%12s %23s %23s %s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "SVM ID", "SVM Name"; first=0}; printf formatStr, region, $1, $2, $3}' < $tmpout + fi +done diff --git a/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_volumes b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_volumes new file mode 100755 index 0000000..d6f70a0 --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/list_fsxn_volumes @@ -0,0 +1,165 @@ +#!/bin/bash +################################################################################ +# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR' +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################################################################ +# +################################################################################ +# This script will list all the AWS FSx volumes that a user has access to. +# It will list: +# o Region +# o File System ID +# o File System Name - optional +# o Volume ID +# o Volume Name +# o Volume Status +# +################################################################################ + +################################################################################ +# This function outputs the usage information and exists. +################################################################################ +usage () { + cat 1>&2 < /dev/null 2>&1; then + : +else + echo "Error, this script requires both the 'aws' and 'jq' commands to run." 1>&2 + exit 1 +fi +# +# Process command line arguments. +allRegions=false +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +includeFsName=false +excludeRoot=false +filter="" +fsid="" +fileSystemName="" +while getopts "hr:af:i:nos:" option; do + case "$option" in + r) region="$OPTARG" + ;; + a) allRegions=true + ;; + f) fileSystemName="$OPTARG" + ;; + i) fsid="$OPTARG" + ;; + n) includeFsName=true + ;; + o) excludeRoot=true + er="and (.OntapConfiguration.StorageVirtualMachineRoot | not)" + ;; + s) svmID="$OPTARG" + ;; + *) usage + ;; + esac +done + +if [ ! -z "$fileSystenName" -a ! 
-z "$fsid" ]; then + echo "Error, you can't provide both -f and -n options." 1>&2 + exit 1 +fi + +if [ ! -z "$fileSystemName" ]; then + fsid=$(aws fsx describe-file-systems --region $region --output=json 2> /dev/null | jq -r ".FileSystems[] | if((.Tags[] | select(.Key == \"Name\") .Value) == \"${fileSystemName}\") then .FileSystemId else empty end" 2> /dev/null) + if [ -z "$fsid" ]; then + echo "Error, failed to find the file system with the file system name of '$fileSystemName'." 1>&2 + exit 1 + fi + filter='--filters [{"Name":"file-system-id","Values":["'$fsid'"]}]' +fi + +if [ ! -z "$fsid" -a -z "$fileSystemName" ]; then + fileSystemName=$(aws fsx describe-file-systems --region $region --output=json 2> /dev/null | jq -r ".FileSystems[] | if(.FileSystemId == \"$fsid\") then (.Tags[] | select(.Key == \"Name\") .Value) else empty end" 2> /dev/null) + if [ -z "$fileSystemName" ]; then + echo "Error, failed to find the file system with the file system ID of '$fsid'." 1>&2 + exit 1 + fi + filter='--filters [{"Name":"file-system-id","Values":["'$fsid'"]}]' +fi + +if [ "$allRegions" = "true" ]; then + # + # Generate a list of all the valid regions the user can search. That is the + # intersection of all the regions they have enabled, and the regions that + # support FSxN + allEndabledRegions=$(aws ec2 describe-regions --query "Regions[].RegionName" --output=json | jq -r '.[]') + allFsxnRegions=$(curl -s https://api.regional-table.region-services.aws.a2z.com/index.json | jq -r '.prices[] | select(.attributes."aws:serviceName" == "Amazon FSx for NetApp ONTAP") .attributes."aws:region"') + for reg in $allEndabledRegions; do + for fsxnReg in $allFsxnRegions; do + if [ $reg == $fsxnReg ]; then + regions+=($reg) + fi + done + done + if [ -z "$regions" ]; then + echo "Error, failed to get the list of regions that support FSxN" 1>&2 + exit 1 + fi +else + regions=($region) +fi +# +# Loop on all the regions. 
+for region in ${regions[*]}; do + # + # Check that the fsx service is supported in thie region + if [ ! -z "$(getent hosts fsx.$region.amazonaws.com)" ]; then + if [ -z "$svmID" ]; then + if [ "$excludeRoot" != "true" ]; then + aws fsx describe-volumes $filter --region=$region --output=json | jq -r '.Volumes[] | .FileSystemId + "," + .Name + "," + .VolumeId + "," + .Lifecycle' | sort > $tmpout + else + aws fsx describe-volumes $filter --region=$region --output=json | jq -r '.Volumes[] | if(.OntapConfiguration.StorageVirtualMachineRoot | not) then .FileSystemId + "," + .Name + "," + .VolumeId + "," + .Lifecycle else empty end' | sort > $tmpout + fi + else + if [ "$excludeRoot" != "true" ]; then + aws fsx describe-volumes $filter --region=$region --output=json | jq -r '.Volumes[] | if(.OntapConfiguration.StorageVirtualMachineId == "'$svmID'") then .FileSystemId + "," + .Name + "," + .VolumeId + "," + .Lifecycle else empty end' | sort > $tmpout + else + aws fsx describe-volumes $filter --region=$region --output=json | jq -r '.Volumes[] | if(.OntapConfiguration.StorageVirtualMachineId == "'$svmID'" and (.OntapConfiguration.StorageVirtualMachineRoot | not)) then .FileSystemId + "," + .Name + "," + .VolumeId + "," + .Lifecycle else empty end' | sort > $tmpout + fi + fi + + if [ $includeFsName == "true" ]; then + aws fsx describe-file-systems --region=$region --output=json | jq -r '.FileSystems[] | .FileSystemId + "," + (.Tags[] | select(.Key == "Name") .Value)' | fgrep "$fileSystemName" > $tmpout2 + awk -F, -v region=$region 'BEGIN {first=1; maxNameLen=0; while(getline < "'$tmpout2'") {fss[$1]=$2; if(length($2) > maxNameLen) {maxNameLen=length($2)}}; maxNameLen +=2; formatStr="%12s %21s%-"maxNameLen"s %24s %10s %s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "(Name)", "Volume ID", "State", "Volume Name"; first=0}; name="("fss[$1]")"; printf formatStr, region, $1, name, $3, $4, $2}' < $tmpout + else + awk -F, -v region=$region 'BEGIN 
{first=1; formatStr="%12s %21s %24s %10s %s\n"}; {if(first) {printf "\n"; printf formatStr, "Region", "FileSystem ID", "Volume ID", "State", "Volume Name"; first=0}; printf formatStr, region, $1, $3, $4, $2}' < $tmpout + fi + else + if [ $allRegions != "true" ]; then + printf "The fsx service is currently not supported in the $region region.\n" + fi + fi +done diff --git a/FSx_ONTAP_AWS_CLI_Scripts/purge_fsxn_backups b/FSx_ONTAP_AWS_CLI_Scripts/purge_fsxn_backups new file mode 100755 index 0000000..5b17421 --- /dev/null +++ b/FSx_ONTAP_AWS_CLI_Scripts/purge_fsxn_backups @@ -0,0 +1,130 @@ +#!/bin/bash +# +################################################################################ +# This script is used to delete any undesirable FSxN backups. By default, an +# undesirable backup is defined as one that was created when a volume was +# deleted and is more than 31 days old. These backups will be of type +# "User Initiated", so they will never get deleted, and have a "Name" tag +# with a value that starts with: "Final backup for fsvol". There are various +# options that allow you to broaden the scope of what is considered +# undesirable. +################################################################################ +# +# Check that the required commands are installed +if which aws jq > /dev/null 2>&1; then + : +else + echo "Error, this script requires the 'aws' and 'jq' commands to be installed and configured before it can be run." 1>&2 + exit 1 +fi +# +# Define a tempoary file to write data to. +tmpout=/tmp/purge_backups.$$ +trap 'rm -f $tmpout' exit +# +# Get the default region. +region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +# +# Define an "old backup" as one that is older than 31 days. +oldBackup=$((60*60*24*31)) +# +# Process command line noptions.. 
+deleteAuto=false +deleteAllUser=false +deleteFinal=true +allRegions=false +cmd=aws +dryRun=false +while getopts "nfuar:dt:h" option; do + case $option in + r) region=$OPTARG + ;; + a) allRegions=true + ;; + t) oldBackup=$(($OPTARG*60*60*24)) + ;; + d) deleteAuto=true + ;; + f) deleteFinal=false + ;; + u) deleteAllUser=true + ;; + n) dryRun=true + cmd="echo aws" + ;; + *) cat 1>&2 <&2 + exit 1 + fi +else + regions=($region) +fi +# +# Get the current time in seconds. +currentTime=$(date +%s) +if [ -z "$currentTime" ]; then + echo "Error, could not the current time." + exit 1 +fi +# +# Loop on all the regions defined above. +for region in ${regions[*]}; do + if [ "$dryRun" == "true" ]; then + echo "Looking for backups in region: $region." + fi + # + # Get the current list of the backup IDs with their creation time and type. + aws fsx describe-backups --region $region --query "Backups[*].{BackupId: BackupId, CreationTime: CreationTime, Type: Type, Name: Tags[?Key=='Name']|[0].Value}" --output=json > $tmpout + jq -r '.[] | "\(.BackupId) \(.CreationTime) \(.Type) \(.Name)\n"' $tmpout | + while read backupId creationTime backupType backupName; do + backupTime=$(date --date="$creationTime" "+%s") + if [ -z "$backupTime" ]; then + echo "Error, could not calculate the backup time." 1>&2 + exit 1 + fi + backupAge=$((currentTime-backupTime)) + deleteIt=false + if [ $backupAge -gt $oldBackup ]; then + if [ $deleteAuto == true -a "$backupType" == "AUTOMATIC" -o \ + $deleteAllUser == true -a "$backupType" == "USER_INITIATED" -o \ + "${backupName:0:22}" == "Final backup for fsvol" -a "$backupType" == "USER_INITIATED" -a $deleteFinal == true ]; then + deleteIt=true + fi + fi + if [ $deleteIt == true ]; then + if $cmd fsx delete-backup --backup-id "$backupId" --region $region; then + : + else + echo "Error, the deletion failed." 
1>&2 + exit 1 + fi + fi + done +done diff --git a/FSx_ONTAP_Monitoring_with_Lambda/README.md b/FSx_ONTAP_Monitoring_with_Lambda/README.md new file mode 100644 index 0000000..bd33969 --- /dev/null +++ b/FSx_ONTAP_Monitoring_with_Lambda/README.md @@ -0,0 +1,268 @@ +# Introduction +Currently there is some functionality within an FSx for NetApp ONTAP file system for which there is no corresponding +CloudWatch metrics. For example, there is no CloudWatch metrics for a SnapMirror relationship, so there is no way to +alert on when an update has stalled, or it is simply not considered Healthy by Data ONTAP. The purpose of this blog +is to show how a relatively small Python program, that can be run as a Lambda function, can leverage the ONTAP APIs +to obtain the required information to detect certain conditions, and when found, send SNS messages to alert someone. + +This program was initially created to forward EMS messages to an AWS service outside of the FSxN file system since +there was no way to do that from the FSxN file system itself (i.e. the syslog forwarding didn't work at the time). As it turns out this is +no longer the case, in that as of Data ONTAP 9.13.1 you can now forward EMS messages to a 'syslog' server. However, once this program was created, +other functionality was added to monitor other Data ONTAP services that AWS didn't provide a way to trigger an alert when +something was outside of an expected realm. For example, if the lag time between SnapMirror synchronization were more +than a specified amount of time. Or, if a SnapMirror update was stalled. This program can alert on all these things and more. +Here is an itemized list of the services that this program can monitor: +- If the file system is available. +- If the underlying Data ONTAP version has changed. +- If the file system is running off its partner node (i.e. a failover has occurred). +- Any EMS message, with filtering to allow you to only be alerted on the ones you care about.
+- If a SnapMirror relationship hasn't been updated in a user specified amount of time. +- If a SnapMirror update has stalled. +- If a SnapMirror relationship is in a "non-healthy" state. +- If the aggregate is over a certain percentage full. User can set two thresholds (Warning and Critical). +- If a volume is over a certain percentage full. User can set two thresholds (Warning and Critical). +- If any quotas are over a certain percentage full. User can follow both soft and hard limits. + +## Preparation +There are a few things you need to do to properly deploy this script. + +### Create an AWS Role +This program doesn't need many permissions. It just needs to be able to read the ONTAP credentials stored in a Secrets Manager secret, +read and write objects in an s3 bucket, and be able to publish SNS messages. Below is the specific list of permissions +needed. The easiest way to give the Lambda function the permissions it needs is by creating a role with these +permissions and assigning that role to the Lambda function. + +| Permission | Reason | +|:-----------------------------|:----------------| +|secretsmanager:GetSecretValue | Needs to be able to retrieve the FSxN administrator credentials. | +|sns:Publish | Since it sends messages (alerts) via SNS, it needs to be able to do so. | +|s3:PutObject | The program stores its state information in various s3 objects.| +|s3:GetObject | The program reads previous state information, as well as configuration from various s3 objects. | +|s3:ListBucket | To allow the program to know if an object exists or not. | + +### Create an S3 Bucket +One of the goals of the program is to not send multiple messages for the same event. It does this by storing the event +information in an s3 object so it can be compared against before sending a second message for the same event. +Note that it doesn't keep every event indefinitely, it only stores them while the condition is true.
So, say for +example it sends an alert for a SnapMirror relationship that has a lag time that is too long. It will +send the alert and store the event. Once a successful SnapMirror synchronization has happened, the event will be removed +from the s3 object allowing for a new event to be created and alarmed on. + +So, for the program to function, you will need to provide an S3 bucket for it to store event history. It is recommended to +have a separate bucket for each deployment of this function. However, that isn't required, since you can +specify the object names for the event file and therefore you could manually ensure that each instance of the Lambda function doesn't +overwrite the event files of another instance. + +### Create an SNS Topic +Since the way this program sends alerts is via an SNS topic, you need to either create an SNS topic, or use an +existing one. + +### Endpoints for AWS services +If you deploy this as a Lambda function, you will have to attach it to the VPC that your FSxN file system resides +in so it can run ONTAP APIs against it. When you do that, the Lambda function will not be able to access the +Internet, even if the subnet it is attached to can. Therefore, the Lambda function will require AWS Service Endpoints for +any service that it uses. In the case of this program, it needs an endpoint for the SNS, Secrets Manager and S3 services. +For the S3 service, it is best to deploy a "Gateway" type endpoint, since they are free. Unfortunately, you can't +deploy a Gateway type endpoint for the SNS and Secret Manager services, so those have to be "Interface" type. If +you don't setup the endpoints, the Lambda function will hang on the first AWS API call it tries to perform, which is typically calling the +Secrets Manager to obtain the credentials of the administrator account for the FSxN File System. So, if you +find that the Lambda function times out, even after adjusting the timeout to more than a minute, then chances +are this is your problem.
+ +**NOTE:** The way the Lambda function is able to use the "local" (i.e. within the subnet) Interface endpoint, as +opposed to the Internet facing one, is usually from the DNS resolution of the endpoint hostname +"..amazonaws.com". In order for that to happen, you have to enable “Private DNS names” +for the endpoint. In order to do that, it is required to enable “DNS Hostnames” within the VPC settings. This VPC +setting is not enabled by default. After making these changes, if you are using Route53 as your DNS resolver for +your VPC, then it will automatically return the local endpoint IP address instead of the Internet facing one. +However, if you have your VPC setup to not use Route53 as its DNS resolver then you'll need to override the +endpoint that the Lambda function uses for the SNS and Secrets Manager services by setting the snsEndPointHostname, +and secretsManagerEndPointHostname configuration variables (you'll see how to do that below). You should set +them to the "local" DNS name of the respective endpoints. + +### Lambda Function +There are a few things you need to do to properly configure the Lambda function. +- Give it the permissions listed above. +- Put it into the same VPC and subnet as the FSxN file system. +- Increase the total run time to at least 10 seconds. You might have to raise that if you have a lot of components in your FSxN file system. However, if you have to raise it to more than a minute, it could be an issue with the endpoint causing the calls to the AWS services to hang. See the Endpoint section above for more information. +- Provide for the base configuration via environment variables and a configuration file. +- Create the "Matching Conditions" file, that specifies when the Lambda function should send an alert. +- Set up an EventBridge Schedule rule to trigger the function on a regular basis. + +#### Configuration Parameters +Below is a list of parameters that are used to configure the program. 
Some parameters are required to be set
for the program to function, and others that are optional. Some of the optional ones are still required but
will have a usable default value if the parameter is not set. For the parameters that aren't required to be
set via an environment variable, they can be set by creating a "configuration file" and putting the assignments
in it. The assignments should be of the form "parameter=value". The default filename for the configuration
file is what you set the OntapAdminServer variable to plus the string "-config". If you want to use a different
filename, then set the configFilename environment variable to the name of your choosing.

**NOTE:** Parameter names are case sensitive.

|Parameter Name | Required | Required as an Environment Variable | Default Value | Description |
|:--------------|:---------|:------------------------------------|:--------------|:------------|
|s3BucketName | Yes | Yes | None | Set to the name of the S3 bucket you want the program to store events to. It will also read the matching configuration file from this bucket. |
|s3BucketRegion | Yes | Yes | None | Set to the region the S3 bucket resides in. |
|configFilename | No | Yes | OntapAdminServer + "-config" | Set to the filename (S3 object) that contains parameter assignments. It's okay if it doesn't exist, as long as there are environment variables for all the required parameters. |
| emsEventsFilename | No | No | OntapAdminServer + "-emsEvents" | Set to the filename (S3 object) that you want the program to store the EMS events that it alerts on into. This file will be created as necessary. |
| smEventsFilename | No | No | OntapAdminServer + "-smEvents" | Set to the filename (S3 object) that you want the program to store the SnapMirror alerts into. This file will be created as necessary.
| +| smRelationshipsFilename | No | No | OntapAdminServer + "-smRelationships" | Set to the filename (S3 object) that you want the program to store the SnapMirror relationships into. This file will be created as necessary. | +| storageEventsFilename | No | No | OntapAdminServer + "-storageEvents" | Set to the filename (S3 object) that you want the program to store the Storage alerts into. This file will be created as necessary. | +| quotaEventsFilename | No | No | OntapAdminServer + "-quotaEvents" | Set to the filename (S3 object) that you want the program to store the Quota alerts into. This file will be created as necessary. | +| systemStatusFilename | No | No | OntapAdminServer + "-systemStatus" | Set to the filename (S3 object) that you want the program to store the overall system status information into. This file will be created as necessary. | +| snsTopicArn | Yes | No | None | Set to the ARN of the SNS topic you want the program to publish alert messages to. | +| snsRegion | Yes | No | None | The region where the SNS topic resides. | +| conditionsFilename | Yes | No | OntapAdminServer + "-conditions" | Set to the filename (S3 object) where you want the program to read the matching condition information from. | +| secretName | Yes | No | None | Set to the name of the secret within the AWS Secrets Manager that holds the ONTAP credentials. | +| secretRegion | Yes | No | None | Set to the region where the secretName is stored. | +| secretUsernameKey | Yes | No | None | Set to the key name within the secretName that holds the username portion of the ONTAP credentials. | +| secretPasswordKey | Yes | No | None | Set to the key name within the secretName that holds the password portion of the ONTAP credentials. | +| snsEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SNS endpoint created above. | +| secretsManagerEndPointHostname | No | No | None | Set to the DNS hostname assigned to the SecretsManager endpoint created above. 
| +| syslogIP | No | No | None | To have the program send syslog messages anytime it sends an SNS message set this to the IP address (or hostname) of the syslog server to send the messages to. | + +#### Matching Conditions File +To specify which events you want to be alerted on, you create a "Matching Conditions" file. The format of the +file is JSON. JSON is basically a series of "key" : "value" pairs. Where the value can be object that also has +"key" : "value" pairs. For more information about the format of a JSON file, please refer to this page. The JSON +schema in this file is made up of an array with a key name of "services". Each element of the "services" array +is an object with two keys. The first key is “name" which specifies the name of the service it is going to provide +matching conditions (rules) for. The second key is "rules" which is an array of objects that provide the specific +matching conditions. Note that each service's rules has its own unique schema. The following is the unique schema +for each of the service's rules. + +##### Matching condition schema for System Health +Each rule should be an object with one, or more, of the following keys: + +- versionChange - Is a Boolean (true, false) and if 'true' will send an alert when the ONTAP version changes. If it is set to false, it will not report on version changes. +- failover - Is a Boolean (true, false) and if 'true' will send an alert if the FSxN cluster is running on its standby node. If it is set to false, it will not report on failover status. +- networkInterfaces - Is a Boolean (true, false) and if 'true' will send an alert if any of the network interfaces are down. If it is set to false, it will not report on any network interfaces that are down. + +##### Matching condition schema for EMS Messages +Each rule should be an object with three keys: + +- "name" - Which will match on the EMS event name. +- "message" - Which will match on the EMS event message text. 
+- "severity" - Which will match on the severity of the EMS event (debug, informational, notice, error, alert or emergency). +Note that all values to each of the keys are used as a regular expressions against the associated EMS component. So, for example, if you want to match on any event message text that starts with “snapmirror” then you would put “^snapmirror”. The “^” character matches the beginning on the string. If you want to match on a specific EMS event name, then you should anchor it with an regular express that starts with “^” for the beginning of the string and ends with “$” for the end of the string. For example, “^arw.volume.state$’. For a complete explanation of the regular expression syntax and special characters, please see the Python documentation found here Regular expression operations. + +##### Matching condition schema for SnapMirror relationships +Each rule should be an object with one, or more, of the following keys: + +- maxLagTime - Specifies the maximum allowable time, in seconds, since the last successful SnapMirror update before an alert will be sent. +- stalledTransferSeconds - Specifies the minimum number of seconds that have to transpire before a SnapMirror transfer will be considered stalled. +- health - Is a Boolean (true, false) which specifies if you want to alert on a healthy relationship (true) or an unhealthy relationship (false). + +##### Matching condition schema for Storage +Each rule should be an object with one, or more, of the following keys: + +- aggrWarnPercentUsed - Specifies the maximum allowable physical storage (aggregate) utilization (between 0 and 100) before an alert is sent. +- aggrCriticalPercentUsed - Specifies the maximum allowable physical storage (aggregate) utilization (between 0 and 100) before an alert is sent. +- volumeWarnPercentUsed - Specifies the maximum allowable volume utilization (between 0 and 100) before an alert is sent. 
+- volumeCriticalPercentUsed - Specifies the maximum allowable volume utilization (between 0 and 100) before an alert is sent. + +##### Matching condition schema for Quota +Each rule should be an object with one, or more, of the following keys: + +- maxHardQuotaSpacePercentUsed - Specifies the maximum allowable storage utilization (between 0 and 100) against the hard quota limit before an alert is sent. +- maxSoftQuotaSpacePercentUsed - Specifies the maximum allowable storage utilization (between 0 and 100) against the soft quota limit before an alert is sent. +- maxQuotaInodesPercentUsed - Specifies the maximum allowable inode utilization (between 0 and 100) before an alert is sent. + +##### Example Matching conditions file: +``` +{ + "services": [ + { + "name": "systemHealth", + "rules": [ + { + "versionChange": true, + "failover": true + }, + { + "networkInterfaces": true + } + ] + }, + { + "name": "ems", + "rules": [ + { + "name": "^passwd.changed$", + "severity": "", + "message": "" + }, + { + "name": "", + "severity": "notice|info|error|alert|emergency", + "message": "" + } + ] + }, + { + "name": "snapmirror", + "rules": [ + { + "maxLagTime": 120 + }, + { + "healthy": false + }, + { + "stalledTransferSeconds": 60 + } + ] + }, + { + "name": "storage", + "rules": [ + { + "aggrWarnPercentUsed": 80 + }, + { + "aggrCriticalPercentUsed": 95 + }, + { + "volumeWarnPercentUsed": 85 + }, + { + "volumeCriticalPercentUsed": 90 + } + ] + }, + { + "name": "quota", + "rules": [ + { + "maxHardQuotaSpacePercentUsed": 95 + }, + { + "maxSoftQuotaSpacePercentUsed": 100 + }, + { + "maxQuotaInodesPercentUsed": 95 + } + ] + } + ] +} +``` +In the above example, it will alert on: + +- Any version change, including patch level, of the ONTAP O/S. +- If the system is running off of the standby node. +- Any network interfaces that are down. +- Any EMS message that has an event name of “passwd.changed”. +- Any EMS message that has a severity of "alert" or “emergency”. 
- Any SnapMirror relationship with a lag time of more than 120 seconds (2 minutes).
- Any SnapMirror relationship that has a non-healthy status.
- Any SnapMirror update that hasn't had any flow of data in 60 seconds (1 minute).
- If the cluster aggregate is more than 80% full.
- If the cluster aggregate is more than 95% full.
- If any volume is more than 85% full.
- If any volume is more than 90% full.
- If any quota policies where the space utilization is more than 95% of the hard limit.
- If any quota policies where the space utilization is more than 100% of the soft limit.
- If any quota policies are showing any inode utilization more than 95%.

A matching conditions file must be created and stored in the S3 bucket with the name given as the "conditionsFilename" configuration variable. Feel free to use the example above as a starting point. Note that you should ensure it is in valid JSON format, otherwise the program will fail to load the file. There are various programs and websites that can validate a JSON file for you.

diff --git a/FSx_ONTAP_Monitoring_with_Lambda/monitor_fsxn_services.py b/FSx_ONTAP_Monitoring_with_Lambda/monitor_fsxn_services.py
new file mode 100755
index 0000000..96c9c2a
--- /dev/null
+++ b/FSx_ONTAP_Monitoring_with_Lambda/monitor_fsxn_services.py
@@ -0,0 +1,1036 @@
#!/bin/python3
################################################################################
# THIS SOFTWARE IS PROVIDED BY NETAPP "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO
# EVENT SHALL NETAPP BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR'
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
################################################################################
#
################################################################################
# This program is used to monitor some of Data ONTAP services (EMS Message,
# Snapmirror relationships, quotas) running under AMS, and alert on any
# "matching conditions." It is intended to be run as a Lambda function, but
# can be run as a standalone program.
#
# Version: %%VERSION%%
# Date: %%DATE%%
################################################################################

import botocore
import boto3
import json
import urllib3
from urllib3.util import Retry
import re
import os
import datetime
import logging
import socket
from logging.handlers import SysLogHandler
import uuid

eventResilience = 4  # Times an event has to be missing before it is removed
                     # from the alert history.
                     # This was added since the Ontap API that returns EMS
                     # events would often drop some events and then including
                     # them in the subsequent calls. If I don't "age" the
                     # alert history duplicate alerts will be sent.
initialVersion = "Initial Run"  # The version to store if this is the first
                                # time the program has been run against a
                                # FSxN.

################################################################################
# This function is used to extract the one-, two-, or three-digit number from
# the string passed in, starting at the 'start' character. Then, multiply it
# by the unit after the number:
#   D = Day = 60*60*24
#   H = Hour = 60*60
#   M = Minutes = 60
#
# It returns a tuple that has the extracted number and the end position.
################################################################################
def getNumber(string, start):

    if len(string) <= start:
        return (0, start)
    #
    # Check to see if it is a 1, 2 or 3 digit number.
    startp1=start+1  # Single digit
    startp2=start+2  # Double digit
    startp3=start+3  # Triple digit
    if re.search('[0-9]', string[startp1:startp2]) and re.search('[0-9]', string[startp2:startp3]):
        end=startp3
    elif re.search('[0-9]', string[startp1:startp2]):
        end=startp2
    else:
        end=startp1

    num=int(string[start:end])

    endp1=end+1
    if string[end:endp1] == "D":
        num=num*60*60*24
    elif string[end:endp1] == "H":
        num=num*60*60
    elif string[end:endp1] == "M":
        num=num*60
    elif string[end:endp1] != "S":
        print(f'Unknown lag time specifier "{string[end:endp1]}".')

    return (num, endp1)

################################################################################
# This function is used to parse the lag time string returned by the
# ONTAP API and return the equivalent seconds it represents.
# The input string is assumed to follow this pattern "P#DT#H#M#S" where
# each of those "#" can be one to three digits long. Also, if the lag isn't
# more than 24 hours, then the "#D" isn't there and the string simply starts
# with "PT". Similarly, if the lag time isn't more than an hour then the "#H"
# string is missing.
################################################################################
def parseLagTime(string):
    #
    num=0
    #
    # First check to see if the Day field is there, by checking to see if the
    # second character is a digit. If not, it is assumed to be 'T'.
    includesDay=False
    if re.search('[0-9]', string[1:2]):
        includesDay=True
        start=1
    else:
        start=2
    data=getNumber(string, start)
    num += data[0]

    start=data[1]
    #
    # If there is a 'D', then there is a 'T' between the D and the # of hours
    # so skip pass it.
    if includesDay:
        start += 1
    data=getNumber(string, start)
    num += data[0]

    start=data[1]
    data=getNumber(string, start)
    num += data[0]

    start=data[1]
    data=getNumber(string, start)
    num += data[0]

    return(num)

################################################################################
# This function checks to see if an event is in the events array based on
# the unique Identifier passed in. It will also update the "refresh" field on
# any matches.
################################################################################
def eventExist (events, uniqueIdentifier):
    for event in events:
        if event["index"] == uniqueIdentifier:
            event["refresh"] = eventResilience
            return True

    return False

################################################################################
# This function makes an API call to the FSxN to ensure it is up. If it
# errors out, then it sends an alert, and returns 'False'. Otherwise it returns
# 'True'.
################################################################################
def checkSystem():
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger

    changedEvents = False
    #
    # Get the previous status.
    try:
        data = s3Client.get_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then this must be the
        # first time this script has run against this filesystem so create an
        # initial status structure.
        if err.response['Error']['Code'] == "NoSuchKey":
            fsxStatus = {
                "systemHealth": True,
                "version" : initialVersion,
                "numberNodes" : 2,
                "downInterfaces" : []
            }
            changedEvents = True
        else:
            raise err
    else:
        fsxStatus = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Get the cluster name and ONTAP version from the FSxN.
    # This is also a way to test that the FSxN cluster is accessible.
    try:
        endpoint = f'https://{config["OntapAdminServer"]}/api/cluster?fields=version,name'
        response = http.request('GET', endpoint, headers=headers, timeout=1.0)
        if response.status == 200:
            if not fsxStatus["systemHealth"]:
                fsxStatus["systemHealth"] = True
                changedEvents = True

            data = json.loads(response.data)
            if config["awsAccountId"] is not None:
                clusterName = f'{data["name"]}({config["awsAccountId"]})'
            else:
                clusterName = data['name']
            #
            # The following assumes that the format of the "full" version
            # looks like: "NetApp Release 9.13.1P6: Tue Dec 05 16:06:25 UTC 2023".
            # The reason for looking at the "full" instead of the individual
            # keys (generation, major, minor) is because they don't provide
            # the patch level. :-(
            clusterVersion = data["version"]["full"].split()[2].replace(":", "")
            if fsxStatus["version"] == initialVersion:
                fsxStatus["version"] = clusterVersion
        else:
            print(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
    except Exception:  # Any failure (timeout, DNS, TLS, JSON) means the cluster may be down.
        if fsxStatus["systemHealth"]:
            if config["awsAccountId"] is not None:
                clusterName = f'{config["OntapAdminServer"]}({config["awsAccountId"]})'
            else:
                clusterName = config["OntapAdminServer"]
            message = f'CRITICAL: Failed to issue API against {clusterName}. Cluster could be down.'
            logger.critical(message)
            snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
            fsxStatus["systemHealth"] = False
            changedEvents = True

    if changedEvents:
        s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8'))
    #
    # If the cluster is down, return false so the program can exit cleanly.
    return(fsxStatus["systemHealth"])

################################################################################
# This function checks the following things:
#   o If the ONTAP version has changed.
#   o If one of the nodes are down.
#   o If a network interface is down.
#
# ASSUMPTIONS: That checkSystem() has been called before it.
################################################################################
def checkSystemHealth(service):
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger

    changedEvents = False
    #
    # Get the previous status.
    # Shouldn't have to check for status of the get_object() call, to see if the object exist or not,
    # since "checkSystem()" should already have been called and it creates the object if it doesn't
    # already exist. So, if there is a failure, it should be something else than "non-existent".
    data = s3Client.get_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"])
    fsxStatus = json.loads(data["Body"].read().decode('UTF-8'))

    for rule in service["rules"]:
        for key in rule.keys():
            lkey = key.lower()
            if lkey == "versionchange":
                if rule[key] and clusterVersion != fsxStatus["version"]:
                    message = f'NOTICE: The ONTAP version changed on cluster {clusterName} from {fsxStatus["version"]} to {clusterVersion}.'
                    logger.info(message)
                    snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                    fsxStatus["version"] = clusterVersion
                    changedEvents = True
            elif lkey == "failover":
                #
                # Check that both nodes are available.
                # Using the CLI passthrough API because I couldn't find the equivalent API call.
                if rule[key]:
                    endpoint = f'https://{config["OntapAdminServer"]}/api/private/cli/system/node/virtual-machine/instance/show-settings'
                    response = http.request('GET', endpoint, headers=headers)
                    if response.status == 200:
                        data = json.loads(response.data)
                        if data["num_records"] != fsxStatus["numberNodes"]:
                            message = f'Alert: The number of nodes on cluster {clusterName} went from {fsxStatus["numberNodes"]} to {data["num_records"]}.'
                            logger.info(message)
                            snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                            fsxStatus["numberNodes"] = data["num_records"]
                            changedEvents = True
                    else:
                        print(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
            elif lkey == "networkinterfaces":
                if rule[key]:
                    endpoint = f'https://{config["OntapAdminServer"]}/api/network/ip/interfaces?fields=state'
                    response = http.request('GET', endpoint, headers=headers)
                    if response.status == 200:
                        #
                        # Decrement the refresh field to know if any events have really gone away.
                        for interface in fsxStatus["downInterfaces"]:
                            interface["refresh"] -= 1

                        data = json.loads(response.data)
                        for interface in data["records"]:
                            if interface.get("state") is not None and interface["state"] != "up":
                                uniqueIdentifier = interface["name"]
                                if(not eventExist(fsxStatus["downInterfaces"], uniqueIdentifier)):  # Resets the refresh key.
                                    message = f'Alert: Network interface {interface["name"]} on cluster {clusterName} is down.'
                                    logger.info(message)
                                    snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                    event = {
                                        "index": uniqueIdentifier,
                                        "refresh": eventResilience
                                    }
                                    fsxStatus["downInterfaces"].append(event)
                                    changedEvents = True
                        #
                        # After processing the records, see if any events need to be removed.
                        i = 0
                        while i < len(fsxStatus["downInterfaces"]):
                            if fsxStatus["downInterfaces"][i]["refresh"] <= 0:
                                print(f'Deleting downed interface: {fsxStatus["downInterfaces"][i]["index"]}')
                                del fsxStatus["downInterfaces"][i]
                                changedEvents = True
                            else:
                                if fsxStatus["downInterfaces"][i]["refresh"] != eventResilience:
                                    changedEvents = True
                                i += 1
                    else:
                        print(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
            else:
                print(f'Unknown System Health alert type: "{key}".')

    if changedEvents:
        s3Client.put_object(Key=config["systemStatusFilename"], Bucket=config["s3BucketName"], Body=json.dumps(fsxStatus).encode('UTF-8'))

################################################################################
# This function processes the EMS events.
################################################################################
def processEMSEvents(service):
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger

    changedEvents = False
    #
    # Get the saved events so we can ensure we are only reporting on new ones.
    try:
        data = s3Client.get_object(Key=config["emsEventsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then it will get created once an alert is sent.
        if err.response['Error']['Code'] == "NoSuchKey":
            events = []
        else:
            raise err
    else:
        events = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Decrement the refresh field to know if any records have really gone away.
    for event in events:
        event["refresh"] -= 1
    #
    # Run the API call to get the current list of EMS events.
    endpoint = f'https://{config["OntapAdminServer"]}/api/support/ems/events'
    response = http.request('GET', endpoint, headers=headers)
    if response.status == 200:
        data = json.loads(response.data)
        #
        # Process the events to see if there are any new ones.
        print(f'Received {len(data["records"])} EMS records.')
        logger.debug(f'Received {len(data["records"])} EMS records.')
        for record in data["records"]:
            for rule in service["rules"]:
                if (re.search(rule["name"], record["message"]["name"]) and
                        re.search(rule["severity"], record["message"]["severity"]) and
                        re.search(rule["message"], record["log_message"])):
                    if (not eventExist (events, record["index"])):  # This resets the "refresh" field if found.
                        message = f'{record["time"]} : {clusterName} {record["message"]["name"]}({record["message"]["severity"]}) - {record["log_message"]}'
                        useverity=record["message"]["severity"].upper()
                        if useverity == "EMERGENCY":
                            logger.critical(message)
                        elif useverity == "ALERT":
                            logger.error(message)
                        elif useverity == "ERROR":
                            logger.warning(message)
                        elif useverity == "NOTICE" or useverity == "INFORMATIONAL":
                            logger.info(message)
                        elif useverity == "DEBUG":
                            logger.debug(message)
                        else:
                            print(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.')
                            logger.info(f'Received unknown severity from ONTAP "{record["message"]["severity"]}". The message received is next.')
                            logger.info(message)
                        snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                        changedEvents = True
                        event = {
                            "index": record["index"],
                            "time": record["time"],
                            "messageName": record["message"]["name"],
                            "message": record["log_message"],
                            "refresh": eventResilience
                        }
                        print(message)
                        events.append(event)
        #
        # Now that we have processed all the events, check to see if any events should be deleted.
        i = 0
        while i < len(events):
            if events[i]["refresh"] <= 0:
                print(f'Deleting event: {events[i]["time"]} : {events[i]["message"]}')
                del events[i]
                changedEvents = True
            else:
                # If an event wasn't refreshed, then we need to save the new refresh count.
                if events[i]["refresh"] != eventResilience:
                    changedEvents = True
                i += 1
        #
        # If the events array changed, save it.
        if changedEvents:
            s3Client.put_object(Key=config["emsEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8'))
    else:
        print(f'API call to {endpoint} failed. HTTP status code: {response.status}.')
        logger.debug(f'API call to {endpoint} failed. HTTP status code: {response.status}.')

################################################################################
# This function is used to find an existing SM relationship based on the source
# and destination path passed in. It returns None if one isn't found.
################################################################################
def getPreviousSMRecord(relationShips, sourceCluster, sourcePath, destPath):
    for relationship in relationShips:
        if relationship['sourcePath'] == sourcePath and relationship['destPath'] == destPath and relationship['sourceCluster'] == sourceCluster:
            relationship['refresh'] = True
            return(relationship)

    return(None)

################################################################################
# This function is used to check SnapMirror relationships.
################################################################################
def processSnapMirrorRelationships(service):
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger
    #
    # Get the saved events so we can ensure we are only reporting on new ones.
    try:
        data = s3Client.get_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then it will get created once an alert is sent.
        if err.response['Error']['Code'] == "NoSuchKey":
            events = []
        else:
            raise err
    else:
        events = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Decrement the refresh field to know if any records have really gone away.
    for event in events:
        event["refresh"] -= 1

    changedEvents=False
    #
    # Get the saved SM relationships.
    try:
        data = s3Client.get_object(Key=config["smRelationshipsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then it will get created once an alert is sent.
        if err.response['Error']['Code'] == "NoSuchKey":
            smRelationships = []
        else:
            raise err
    else:
        smRelationships = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Set the refresh to False to know if any of the relationships still exist.
    for relationship in smRelationships:
        relationship["refresh"] = False

    updateRelationships = False
    #
    # Get the current time in seconds since UNIX epoch 01/01/1970.
    curTime = int(datetime.datetime.now().timestamp())
    #
    # Run the API call to get the current state of all the snapmirror relationships.
    endpoint = f'https://{config["OntapAdminServer"]}/api/snapmirror/relationships?fields=*'
    response = http.request('GET', endpoint, headers=headers)
    if response.status == 200:
        data = json.loads(response.data)

        for record in data["records"]:
            for rule in service["rules"]:
                for key in rule.keys():
                    lkey = key.lower()
                    if lkey == "maxlagtime":
                        if record.get("lag_time") is not None:
                            lagSeconds = parseLagTime(record["lag_time"])
                            if lagSeconds > rule["maxLagTime"]:
                                uniqueIdentifier = record["uuid"] + "_" + key
                                if not eventExist(events, uniqueIdentifier):  # This resets the "refresh" field if found.
                                    message = f'Snapmirror Lag Alert: {record["source"]["cluster"]["name"]}::{record["source"]["path"]} -> {clusterName}::{record["destination"]["path"]} has a lag time of {lagSeconds} seconds.'
                                    logger.warning(message)
                                    snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                    changedEvents=True
                                    event = {
                                        "index": uniqueIdentifier,
                                        "message": message,
                                        "refresh": eventResilience
                                    }
                                    print(message)
                                    events.append(event)
                    elif lkey == "healthy":
                        if not record["healthy"]:
                            uniqueIdentifier = record["uuid"] + "_" + key
                            if not eventExist(events, uniqueIdentifier):  # This resets the "refresh" field if found.
                                message = f'Snapmirror Health Alert: {record["source"]["cluster"]["name"]}::{record["source"]["path"]} {clusterName}::{record["destination"]["path"]} has a status of {record["healthy"]}'
                                logger.warning(message)  # Intentionally put this before adding the reasons, since I'm not sure how syslog will handle a multi-line message.
                                for reason in record["unhealthy_reason"]:
                                    message += "\n" + reason["message"]
                                snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                changedEvents=True
                                event = {
                                    "index": uniqueIdentifier,
                                    "message": message,
                                    "refresh": eventResilience
                                }
                                print(message)
                                events.append(event)
                    elif lkey == "stalledtransferseconds":
                        if record.get('transfer') and record['transfer']['state'].lower() == "transferring":
                            sourcePath = record['source']['path']
                            destPath = record['destination']['path']
                            sourceCluster = record['source']['cluster']['name']
                            bytesTransferred = record['transfer']['bytes_transferred']

                            prevRec = getPreviousSMRecord(smRelationships, sourceCluster, sourcePath, destPath)

                            if prevRec is not None:
                                timeDiff=curTime - prevRec["time"]
                                print(f'transfer bytes last time:{prevRec["bytesTransferred"]} this time:{bytesTransferred} and {timeDiff} > {rule[key]}')
                                if prevRec['bytesTransferred'] == bytesTransferred:
                                    if (curTime - prevRec['time']) > rule[key]:
                                        uniqueIdentifier = record['uuid'] + "_" + "transfer"

                                        if not eventExist(events, uniqueIdentifier):
                                            message = f'Snapmirror transfer has stalled: {sourceCluster}::{sourcePath} -> {clusterName}::{destPath}.'
                                            logger.warning(message)
                                            # NOTE: the Subject must be an f-string so {clusterName} is expanded.
                                            snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                            changedEvents=True
                                            event = {
                                                "index": uniqueIdentifier,
                                                "message": message,
                                                "refresh": eventResilience
                                            }
                                            print(message)
                                            events.append(event)
                                else:
                                    prevRec['time'] = curTime
                                    prevRec['refresh'] = True
                                    prevRec['bytesTransferred'] = bytesTransferred
                                    updateRelationships = True
                            else:
                                prevRec = {
                                    "time": curTime,
                                    "refresh": True,
                                    "bytesTransferred": bytesTransferred,
                                    "sourcePath": sourcePath,
                                    "destPath": destPath,
                                    "sourceCluster": sourceCluster
                                }
                                updateRelationships = True
                                smRelationships.append(prevRec)
                    else:
                        print(f'Unknown snapmirror alert type: "{key}".')
        #
        # After processing the records, see if any SM relationships need to be removed.
        i = 0
        while i < len(smRelationships):
            if not smRelationships[i]["refresh"]:
                del smRelationships[i]
                updateRelationships = True
            else:
                i += 1
        #
        # If any of the SM relationships changed, save it.
        if(updateRelationships):
            s3Client.put_object(Key=config["smRelationshipsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(smRelationships).encode('UTF-8'))
        #
        # After processing the records, see if any events need to be removed.
        i = 0
        while i < len(events):
            if events[i]["refresh"] <= 0:
                print(f'Deleting event: {events[i]["message"]}')
                del events[i]
                changedEvents = True
            else:
                # If an event wasn't refreshed, then we need to save the new refresh count.
                if events[i]["refresh"] != eventResilience:
                    changedEvents = True
                i += 1
        #
        # If the events array changed, save it.
        if(changedEvents):
            s3Client.put_object(Key=config["smEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8'))
    else:
        print(f'API call to {endpoint} failed. HTTP status code {response.status}.')

################################################################################
# This function is used to check all the volume and aggregate utilization.
################################################################################
def processStorageUtilization(service):
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger

    changedEvents=False
    #
    # Get the saved events so we can ensure we are only reporting on new ones.
    try:
        data = s3Client.get_object(Key=config["storageEventsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then it will get created once an alert is sent.
        if err.response['Error']['Code'] == "NoSuchKey":
            events = []
        else:
            raise err
    else:
        events = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Decrement the refresh field to know if any records have really gone away.
    for event in events:
        event["refresh"] -= 1

    for rule in service["rules"]:
        for key in rule.keys():
            lkey=key.lower()
            if lkey == "aggrwarnpercentused" or lkey == 'aggrcriticalpercentused':
                #
                # Run the API call to get the physical storage used.
                endpoint = f'https://{config["OntapAdminServer"]}/api/storage/aggregates?fields=space'
                response = http.request('GET', endpoint, headers=headers)
                if response.status == 200:
                    data = json.loads(response.data)
                    for aggr in data["records"]:
                        if (aggr["space"]["block_storage"]["used"]/aggr["space"]["block_storage"]["size"]) * 100 >= rule[key]:
                            uniqueIdentifier = aggr["uuid"] + "_" + key
                            if not eventExist(events, uniqueIdentifier):  # This resets the "refresh" field if found.
                                alertType = 'Warning' if lkey == "aggrwarnpercentused" else 'Critical'
                                message = f'Aggregate {alertType} Alert: Aggregate {aggr["name"]} on {clusterName} is more than {rule[key]} full.'
                                # (continuation of processStorageUtilization - aggregate alert branch)
                                logger.warning(message)
                                snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                changedEvents = True
                                event = {
                                    "index": uniqueIdentifier,
                                    "message": message,
                                    "refresh": eventResilience
                                }
                                # NOTE(review): this prints the event dict; every other branch prints the message.
                                print(event)
                                events.append(event)
                else:
                    print(f'API call to {endpoint} failed. HTTP status code {response.status}.')
            elif lkey == "volumewarnpercentused" or lkey == "volumecriticalpercentused":
                #
                # Run the API call to get the physical storage used.
                endpoint = f'https://{config["OntapAdminServer"]}/api/storage/volumes?fields=space,svm'
                response = http.request('GET', endpoint, headers=headers)
                if response.status == 200:
                    data = json.loads(response.data)
                    for record in data["records"]:
                        # Volumes that don't report logical_space/used_percent are skipped.
                        if record["space"].get("logical_space") and record["space"]["logical_space"].get("used_percent"):
                            if record["space"]["logical_space"]["used_percent"] >= rule[key]:
                                uniqueIdentifier = record["uuid"] + "_" + key
                                if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found.
                                    alertType = 'Warning' if lkey == "volumewarnpercentused" else 'Critical'
                                    message = f'Volume Usage {alertType} Alert: volume {record["svm"]["name"]}:/{record["name"]} on {clusterName} is {record["space"]["logical_space"]["used_percent"]}% full, which is more than {rule[key]}% full.'
                                    logger.warning(message)
                                    snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                    changedEvents = True
                                    event = {
                                        "index": uniqueIdentifier,
                                        "message": message,
                                        "refresh": eventResilience
                                    }
                                    print(message)
                                    events.append(event)
                else:
                    print(f'API call to {endpoint} failed. HTTP status code {response.status}.')
    #
    # After processing the records, see if any events need to be removed.
    i = 0
    while i < len(events):
        if events[i]["refresh"] <= 0:
            print(f'Deleting event: {events[i]["message"]}')
            del events[i]
            changedEvents = True
        else:
            # If an event wasn't refreshed, then we need to save the new refresh count.
            if events[i]["refresh"] != eventResilience:
                changedEvents = True
            i += 1
    #
    # If the events array changed, save it.
    if(changedEvents):
        s3Client.put_object(Key=config["storageEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8'))

################################################################################
# This function is used to check utilization of quota limits.
################################################################################
def processQuotaUtilization(service):
    # Alerts on quota-report entries whose inode or space usage exceeds the
    # configured percent-of-limit thresholds in service["rules"].
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger

    changedEvents=False
    #
    # Get the saved events so we can ensure we are only reporting on new ones.
    try:
        data = s3Client.get_object(Key=config["quotaEventsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        # If the error is that the object doesn't exist, then it will get created once an alert is sent.
        if err.response['Error']['Code'] == "NoSuchKey":
            events = []
        else:
            raise err
    else:
        events = json.loads(data["Body"].read().decode('UTF-8'))
    #
    # Decrement the refresh field to know if any records have really gone away.
    for event in events:
        event["refresh"] -= 1
    #
    # Run the API call to get the quota report.
    endpoint = f'https://{config["OntapAdminServer"]}/api/storage/quota/reports?fields=*'
    response = http.request('GET', endpoint, headers=headers)
    if response.status == 200:
        data = json.loads(response.data)
        for record in data["records"]:
            for rule in service["rules"]:
                for key in rule.keys():
                    lkey = key.lower() # Convert to all lower case so the key can be case insensitive.
+ if lkey == "maxquotainodespercentused": + # + # Since the quota report might not have the files key, and even if it does, it might not have + # the hard_limit_percent" key, need to check for their existencae first. + if(record.get("files") != None and record["files"]["used"].get("hard_limit_percent") != None and + record["files"]["used"]["hard_limit_percent"] > rule[key]): + uniqueIdentifier = str(record["index"]) + "_" + key + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + if record.get("qtree") != None: + qtree=f' under qtree: {record["qtree"]["name"]} ' + else: + qtree=' ' + if record.get("users") != None: + users=None + for user in record["users"]: + if users == None: + users = user["name"] + else: + users += ',{user["name"]}' + user=f'associated with user(s) "{users}" ' + else: + user='' + message = f'Quota Inode Usage Alert: Quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["files"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its inodes.' + logger.warning(message) + snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + print(message) + events.append(event) + elif lkey == "maxhardquotaspacepercentused": + if(record.get("space") != None and record["space"]["used"].get("hard_limit_percent") and + record["space"]["used"]["hard_limit_percent"] >= rule[key]): + uniqueIdentifier = str(record["index"]) + "_" + key + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. 
+ if record.get("qtree") != None: + qtree=f' under qtree: {record["qtree"]["name"]} ' + else: + qtree=" " + if record.get("users") != None: + users=None + for user in record["users"]: + if users == None: + users = user["name"] + else: + users += ',{user["name"]}' + user=f'associated with user(s) "{users}" ' + else: + user='' + message = f'Quota Space Usage Alert: Hard quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["hard_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' + logger.warning(message) + snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}') + changedEvents=True + event = { + "index": uniqueIdentifier, + "message": message, + "refresh": eventResilience + } + print(message) + events.append(event) + elif lkey == "maxsoftquotaspacepercentused": + if(record.get("space") != None and record["space"]["used"].get("soft_limit_percent") and + record["space"]["used"]["soft_limit_percent"] >= rule[key]): + uniqueIdentifier = str(record["index"]) + "_" + key + if not eventExist(events, uniqueIdentifier): # This resets the "refresh" field if found. + if record.get("qtree") != None: + qtree=f' under qtree: {record["qtree"]["name"]} ' + else: + qtree=" " + if record.get("users") != None: + users=None + for user in record["users"]: + if users == None: + users = user["name"] + else: + users += ',{user["name"]}' + user=f'associated with user(s) "{users}" ' + else: + user='' + message = f'Quota Space Usage Alert: Soft quota of type "{record["type"]}" on {record["svm"]["name"]}:/{record["volume"]["name"]}{qtree}{user}on {clusterName} is using {record["space"]["used"]["soft_limit_percent"]}% which is more than {rule[key]}% of its allocaed space.' 
                                # Soft-quota breaches are logged at info level, unlike the warning
                                # level used for hard-quota and inode alerts.
                                logger.info(message)
                                snsClient.publish(TopicArn=config["snsTopicArn"], Message=message, Subject=f'Monitor ONTAP Services Alert for cluster {clusterName}')
                                changedEvents=True
                                event = {
                                    "index": uniqueIdentifier,
                                    "message": message,
                                    "refresh": eventResilience
                                }
                                print(message)
                                events.append(event)
                    else:
                        print(f'Unknown storage matching condition type "{key}".')
        #
        # After processing the records, see if any events need to be removed.
        i=0
        while i < len(events):
            if events[i]["refresh"] <= 0:
                print(f'Deleting event: {events[i]["message"]}')
                del events[i]
                changedEvents = True
            else:
                # If an event wasn't refreshed, then we need to save the new refresh count.
                if events[i]["refresh"] != eventResilience:
                    changedEvents = True
                i += 1
        #
        # If the events array changed, save it.
        if(changedEvents):
            s3Client.put_object(Key=config["quotaEventsFilename"], Bucket=config["s3BucketName"], Body=json.dumps(events).encode('UTF-8'))
    else:
        print(f'API call to {endpoint} failed. HTTP status code {response.status}.')

################################################################################
# This function is used to read in all the configuration parameters from the
# various places:
#   Environment Variables
#   Config File
#   Calculated
################################################################################
def readInConfig():
    # Populates the module-global "config" dict from (highest precedence first)
    # environment variables, an optional config file in S3, and computed defaults.
    #
    # Define global variables so we don't have to pass them to all the functions.
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger
    #
    # Define a dictionary with all the required variables so we can
    # easily add them and check for their existence.
    requiredEnvVariables = {
        "OntapAdminServer": None,
        "s3BucketName": None,
        "s3BucketRegion": None
    }

    optionalVariables = {
        "configFilename": None,
        "secretsManagerEndPointHostname": None,
        "snsEndPointHostname": None,
        "syslogIP": None,
        "awsAccountId": None
    }

    filenameVariables = {
        "emsEventsFilename": None,
        "smEventsFilename": None,
        "smRelationshipsFilename": None,
        "conditionsFilename": None,
        "storageEventsFilename": None,
        "quotaEventsFilename": None,
        "systemStatusFilename": None
    }

    config = {
        "snsTopicArn": None,
        "snsRegion": None,
        "secretName": None,
        "secretRegion": None,
        "secretUsernameKey": None,
        "secretPasswordKey": None
    }
    config.update(filenameVariables)
    config.update(optionalVariables)
    config.update(requiredEnvVariables)
    #
    # Get the required, and any additional, parameters from the environment.
    for var in config:
        config[var] = os.environ.get(var)
    #
    # Check that required environmental variables are there.
    for var in requiredEnvVariables:
        if config[var] == None:
            raise Exception (f'\n\nMissing required environment variable "{var}".')
    #
    # Open a client to the s3 service.
    s3Client = boto3.client('s3', config["s3BucketRegion"])
    #
    # Calculate the config filename if it hasn't already been provided.
    defaultConfigFilename = config["OntapAdminServer"] + "-config"
    if config["configFilename"] == None:
        config["configFilename"] = defaultConfigFilename
    #
    # Process the config file if it exists.
    try:
        lines = s3Client.get_object(Key=config["configFilename"], Bucket=config["s3BucketName"])['Body'].iter_lines()
    except botocore.exceptions.ClientError as err:
        if err.response['Error']['Code'] != "NoSuchKey":
            raise err
        else:
            # A missing file is only worth a warning when the user explicitly named one.
            if config["configFilename"] != defaultConfigFilename:
                print(f"Warning, did not find file '{config['configFilename']}' in s3 bucket '{config['s3BucketName']}' in region '{config['s3BucketRegion']}'.")
    else:
        #
        # While iterating through the file, get rid of any "export ", comments, blank lines, or anything else that isn't key=value.
        for line in lines:
            line = line.decode('utf-8')
            if line[0:7] == "export ":
                line = line[7:]
            comment = line.split("#")
            line=comment[0].strip().replace('"', '')
            x = line.split("=")
            # NOTE(review): a value containing "=" makes len(x) > 2, so that line is silently ignored.
            if len(x) == 2:
                (key, value) = line.split("=")
                key = key.strip()
                value = value.strip()
                #
                # Preserve any environment variables settings.
                if key in config:
                    if config[key] == None:
                        config[key] = value
                else:
                    print(f"Warning, unknown config parameter '{key}'.")
    #
    # Now, fill in the filenames for any that aren't already defined.
    for filename in filenameVariables:
        if config[filename] == None:
            config[filename] = config["OntapAdminServer"] + "-" + filename.replace("Filename", "")
    #
    # Define the endpoints if an alternate wasn't already given.
    if config["secretsManagerEndPointHostname"] == None or config["secretsManagerEndPointHostname"] == "":
        config["secretsManagerEndPointHostname"] = f'secretsmanager.{config["secretRegion"]}.amazonaws.com'

    if config["snsEndPointHostname"] == None or config["snsEndPointHostname"] == "":
        config["snsEndPointHostname"] = f'sns.{config["snsRegion"]}.amazonaws.com'
    #
    # Now, check that all the configuration parameters have been set.
    for key in config:
        if config[key] == None and key not in optionalVariables:
            raise Exception(f'Missing configuration parameter "{key}".')

################################################################################
# Main logic
################################################################################
def lambda_handler(event, context):
    # Entry point: reads the configuration, wires up logging and AWS clients,
    # then runs each monitor listed in the conditions file against the cluster.
    #
    # Define global variables so we don't have to pass them to all the functions.
    global config, s3Client, snsClient, http, headers, clusterName, clusterVersion, logger
    #
    # Read in the configuration.
    readInConfig() # This defines the s3Client variable.
    #
    # Set up logging.
    logger = logging.getLogger("mon_fsxn_service")
    logger.setLevel(logging.DEBUG) # Anything at this level and above this get logged.
    if config["syslogIP"] != None:
        #
        # Due to a bug with the SysLogHandler() of not sending proper framing with a message
        # when using TCP (it should end it with a LF and not a NUL like it does now) you must add
        # an additional frame delimiter to the receiving syslog server. With rsyslog, you add
        # a AddtlFrameDelimiter="0" directive to the "input()" line where they have it listen
        # to a TCP port. For example:
        #
        #   # provides TCP syslog reception
        #   module(load="imtcp")
        #   input(type="imtcp" port="514" AddtlFrameDelimiter="0")
        #
        # Because of this bug, I am going to stick with UDP, the default protocol used by
        # the syslog handler. If TCP is required, then the above changes will have to be made
        # to the syslog server. Or, the program will have to handle closing and opening the
        # connection for each message. The following will do that:
        #   handler.flush()
        #   handler.close()
        #   logger.removeHandler(handler)
        #   handler = logging.handlers.SysLogHandler(facility=SysLogHandler.LOG_LOCAL0, address=(syslogIP, 514), socktype=socket.SOCK_STREAM)
        #   handler.setFormatter(formatter)
        #   logger.addHandler(handler)
        #
        # You might get away with a simple handler.open() after the close(), without having to
        # remove and add the handler. I didn't test that.
        handler = logging.handlers.SysLogHandler(facility=SysLogHandler.LOG_LOCAL0, address=(config["syslogIP"], 514))
        formatter = logging.Formatter(
            fmt="%(name)s:%(funcName)s - Level:%(levelname)s - Message:%(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    #
    # Create a Secrets Manager client.
    session = boto3.session.Session()
    client = session.client(service_name='secretsmanager', region_name=config["secretRegion"], endpoint_url=f'https://{config["secretsManagerEndPointHostname"]}')
    #
    # Get the username and password of the ONTAP/FSxN system.
    secretsInfo = client.get_secret_value(SecretId=config["secretName"])
    secrets = json.loads(secretsInfo['SecretString'])
    username = secrets[config['secretUsernameKey']]
    password = secrets[config['secretPasswordKey']]
    #
    # Create clients to the other AWS services we will be using.
    # NOTE(review): readInConfig() already created an identical s3Client; this re-creation is redundant but harmless.
    s3Client = boto3.client('s3', config["s3BucketRegion"])
    snsClient = boto3.client('sns', region_name=config["snsRegion"], endpoint_url=f'https://{config["snsEndPointHostname"]}')
    #
    # Create a http handle to make ONTAP/FSxN API calls with.
    auth = urllib3.make_headers(basic_auth=f'{username}:{password}')
    headers = { **auth }
    #
    # Disable warning about connecting to servers with self-signed SSL certificates.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    retries = Retry(total=None, connect=1, read=1, redirect=10, status=0, other=0)
    http = urllib3.PoolManager(cert_reqs='CERT_NONE', retries=retries)
    #
    # Get the conditions we want to alert on.
    try:
        data = s3Client.get_object(Key=config["conditionsFilename"], Bucket=config["s3BucketName"])
    except botocore.exceptions.ClientError as err:
        print(f'\n\nError, could not retrieve configuration file {config["conditionsFilename"]} from: s3://{config["s3BucketName"]}.\nBelow is additional information:\n\n')
        raise err

    matchingConditions = json.loads(data["Body"].read().decode('UTF-8'))

    if(checkSystem()):
        #
        # Loop on all the configured ONTAP services we want to check on.
        for service in matchingConditions["services"]:
            if service["name"].lower() == "systemhealth":
                checkSystemHealth(service)
            elif service["name"].lower() == "ems":
                processEMSEvents(service)
            elif (service["name"].lower() == "snapmirror"):
                processSnapMirrorRelationships(service)
            elif service["name"].lower() == "storage":
                processStorageUtilization(service)
            elif service["name"].lower() == "quota":
                processQuotaUtilization(service)
            else:
                print(f'Unknown service "{service["name"]}".')
    return

# When run outside of Lambda (no AWS_LAMBDA_FUNCTION_NAME in the environment),
# invoke the handler directly so the script can be used from the command line.
if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None:
    lambdaFunction = False
    lambda_handler(None, None)
else:
    lambdaFunction = True