diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..952003f --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,22 @@ +# +# Set a "catch-all" owner. Should be set to the current active admin for the repo. +* @kcantrel +# +# Set the owner for all the individual samples, if the owner is known and they want to continue to maintain the sample: +/AI/GenAI-ChatBot-application-sample/ @cbenbass +/CloudFormation/deploy-fsx-ontap/ @kcantrel +/Ansible/fsx_inventory_report/ @kcantrel +/Ansible/snapmirror_report/ @kcantrel +/EKS/FSxN-as-PVC-for-EKS/ @mickeysh +/Management-Utilities/auto_create_sm_relationships/ @kcantrel +/Management-Utilities/auto_set_fsxn_auto_grow/ @kcantrel +/Management-Utilities/fsx-ontap-aws-cli-scripts/ @kcantrel +/Management-Utilities/fsxn-rotate-secret/ @kcantrel +/Management-Utilities/warm_performance_tier/ @kcantrel +/Monitoring/CloudWatch-FSx/ @LirazRom10 +/Monitoring/LUN-monitoring/ @LirazRom10 +/Monitoring/auto-add-cw-alarms/ @kcantrel +/Monitoring/monitor-ontap-services/ @kcantrel +/Terraform/deploy-fsx-ontap-sqlserver/ @varunrai +/Terraform/deploy-fsx-ontap-fileshare-access/ @varunrai +/Terraform/deploy-fsx-ontap/ @kcantrel diff --git a/.github/linters/.yaml-lint.yml b/.github/linters/.yaml-lint.yml index 2959ad2..1fc32a2 100644 --- a/.github/linters/.yaml-lint.yml +++ b/.github/linters/.yaml-lint.yml @@ -2,6 +2,7 @@ extends: default rules: line-length: disable + comments-indentation: disable trailing-spaces: disable truthy: disable braces: diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml new file mode 100755 index 0000000..d19e21b --- /dev/null +++ b/.github/workflows/dependency-review.yml @@ -0,0 +1,39 @@ +# Dependency Review Action +# +# This Action will scan dependency manifest files that change as part of a Pull Request, +# surfacing known-vulnerable versions of the packages declared or updated in the PR. 
+# Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable +# packages will be blocked from merging. +# +# Source repository: https://github.com/actions/dependency-review-action +# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement +name: 'Dependency review' +on: + pull_request: + branches: [ "main" ] + +# If using a dependency submission action in this workflow this permission will need to be set to: +# +# permissions: +# contents: write +# +# https://docs.github.com/en/enterprise-cloud@latest/code-security/supply-chain-security/understanding-your-software-supply-chain/using-the-dependency-submission-api +permissions: + contents: read + # Write permissions for pull-requests are required for using the `comment-summary-in-pr` option, comment out if you aren't using this option + pull-requests: write + +jobs: + dependency-review: + runs-on: ubuntu-latest + steps: + - name: 'Checkout repository' + uses: actions/checkout@v4 + - name: 'Dependency Review' + uses: actions/dependency-review-action@v4 + # Commonly enabled options, see https://github.com/actions/dependency-review-action#configuration-options for all available options. + with: + comment-summary-in-pr: always + # fail-on-severity: moderate + # deny-licenses: GPL-1.0-or-later, LGPL-2.0-or-later + # retry-on-snapshot-warnings: true diff --git a/.github/workflows/update-CloudformationTemplate-auto-add-cw-alarms.yml b/.github/workflows/update-CloudformationTemplate-auto-add-cw-alarms.yml new file mode 100644 index 0000000..1138aee --- /dev/null +++ b/.github/workflows/update-CloudformationTemplate-auto-add-cw-alarms.yml @@ -0,0 +1,37 @@ +--- +# Copyright (c) NetApp, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +name: "Update Cloudformation Template" + +on: + pull_request: + paths: + - 'Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py' + push: + paths: + - 'Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py' + branches: + - main + +jobs: + update-Cloudformation-Template: + runs-on: ubuntu-latest + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + + steps: + - name: Checkout pull request + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + + - name: Update the Cloudformation Template + shell: bash + working-directory: Monitoring/auto-add-cw-alarms + run: ./update-auto-add-cw-alarms-CF-Template + + - name: Commit the changes + uses: stefanzweifel/git-auto-commit-action@v5 diff --git a/AI/GenAI-ChatBot-application-sample/package.json b/AI/GenAI-ChatBot-application-sample/package.json index 77801ec..65643cf 100644 --- a/AI/GenAI-ChatBot-application-sample/package.json +++ b/AI/GenAI-ChatBot-application-sample/package.json @@ -1,6 +1,6 @@ { "name": "gen-ai-standalone-app", - "version": "1.0.0", + "version": "1.0.1", "private": true, "scripts": { "dev": "cross-env HOST=localhost PORT=9091 next dev", @@ -9,27 +9,27 @@ "lint": "next lint" }, "dependencies": { - "@clerk/nextjs": "^5.2.2", - "@reduxjs/toolkit": "^2.2.5", - "@types/react-redux": "^7.1.33", + "@clerk/nextjs": "^5.7.2", + "@reduxjs/toolkit": "^2.2.8", + "@types/react-redux": "^7.1.34", "aws-amplify": "5.3.11", "cross-env": "^7.0.3", - "html-react-parser": "^5.1.10", + "html-react-parser": "^5.1.18", "moment": "^2.30.1", - "next": "14.2.4", + "next": "14.2.15", "react": "^18", "react-dom": "^18", "react-popper-tooltip": "^4.4.2", "react-redux": "^9.1.2", - "sass": "^1.77.6" + "sass": "^1.79.5" }, "devDependencies": { "@svgr/webpack": "^8.1.0", - "@types/node": "^20", + "@types/node": "^22", "@types/react": "^18", "@types/react-dom": "^18", 
"eslint": "^8", - "eslint-config-next": "14.2.4", + "eslint-config-next": "14.2.15", "typescript": "^5" } } diff --git a/Ansible/README.md b/Ansible/README.md new file mode 100644 index 0000000..aedd3e4 --- /dev/null +++ b/Ansible/README.md @@ -0,0 +1,26 @@ +# Ansible Samples +Ansible is a general purpose automation engine. It can be used to automate most repetitive IT tasks. +The typical input you provide Ansible is a "Playbook." This folder holds various playbooks that can be used +to help automate tasks around the management of a FSx for ONTAP file system. +Click here for more information on [Ansible](https://www.ansible.com/). + +| Tool | Description | +| --- | --- | +| [Inventory Report](./fsx_inventory_report) | Creates a report of all the FSxN file systems within an AWS account.| +| [SnapMirror Report](./snapmirror_report) | Creates a report of all SnapMirror relationships within all the FSxN file systems within an AWS account.| + +## Author Information + +This repository is maintained by the contributors listed on [GitHub](https://github.com/NetApp/FSx-ONTAP-samples-scripts/graphs/contributors). + +## License + +Licensed under the Apache License, Version 2.0 (the "License"). + +You may obtain a copy of the License at [apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0). + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an _"AS IS"_ basis, without WARRANTIES or conditions of any kind, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. + +© 2024 NetApp, Inc. All Rights Reserved. 
diff --git a/Ansible/fsx_inventory_report/README.md b/Ansible/fsx_inventory_report/README.md
new file mode 100644
index 0000000..e880749
--- /dev/null
+++ b/Ansible/fsx_inventory_report/README.md
@@ -0,0 +1,68 @@
+# Ansible Inventory Report
+This Ansible playbook generates a report of all the FSx for ONTAP file systems within an AWS account.
+It includes all the SVMs and Volumes. The format of the report is as follows:
+```
+Region:
+ File System ID:
+ SVM ID:
+ Volumes:
+
+
+ SVM ID:
+ Volumes:
+
+
+ File System ID:
+ SVM ID:
+ Volumes:
+
+
+ SVM ID:
+ Volumes:
+
+
+```
+Where:
+ - \<size\> is the provisioned size of the volume in megabytes.
+ - \<style\> is the security style of the volume (e.g. UNIX, NTFS).
+ - \<type\> is the type of the volume (e.g. RW, DP).
+
+## Requirements
+- jq - A lightweight and flexible command-line JSON processor. Installation instructions can be found [here](https://jqlang.github.io/jq/download/)
+- Ansible 2.9 or later. Installation instructions can be found [here](https://docs.ansible.com/ansible/latest/installation_guide/index.html)
+- AWS Ansible collection. This should be included with the base installation of Ansible.
+
+## Installation
+There are three files used to create the report:
+- `generate_report.yaml`: The Ansible playbook that generates the report.
+- `process_region.yaml`: A collection of tasks that will process all the FSxNs in a region.
+- `get_fsxn_regions.yaml`: A collection of tasks that retrieves all the AWS regions, that are enabled for the account, where FSx for ONTAP is available.
+
+## Configuration
+There is a variable that can be changed at the top of the `generate_report.yaml` file:
+- report\_name - Sets the file path of the report that will be generated.
+
+Since this script leverages the AWS Ansible collection as well as the `aws` cli, you will need to provide authentication credentials for them.
+You can read more about how to do that [here](https://docs.ansible.com/ansible/latest/collections/amazon/aws/docsite/aws_ec2_guide.html#authentication). + +## Usage +To generate the report copy the three files mentioned above and run the following command: +```bash +ansible-playbook generate_report.yaml +``` + +## Author Information + +This repository is maintained by the contributors listed on [GitHub](https://github.com/NetApp/FSx-ONTAP-samples-scripts/graphs/contributors). + +## License + +Licensed under the Apache License, Version 2.0 (the "License"). + +You may obtain a copy of the License at [apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0). + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an _"AS IS"_ basis, without WARRANTIES or conditions of any kind, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. + +© 2024 NetApp, Inc. All Rights Reserved. diff --git a/Ansible/fsx_inventory_report/generate_report.yaml b/Ansible/fsx_inventory_report/generate_report.yaml new file mode 100644 index 0000000..5d40b09 --- /dev/null +++ b/Ansible/fsx_inventory_report/generate_report.yaml @@ -0,0 +1,34 @@ +# +# This Ansible playbook generates an inventory report for all the FSXNs +# in all the AWS regions for the account that it is running under. +################################################################################# +--- +- vars: + report_name: output.txt +################################################################################# +# +# Don't change anything below this line. +# +################################################################################# + fsxn_regions: [] + opted_in_regions: [] + + name: Playbook to generate an inventory report on all the FSxNs. 
+ hosts: localhost + collections: + - amazon.aws + gather_facts: false + + tasks: + - name: Make sure the report file is empty. + ansible.builtin.shell: + cmd: echo "" > {{ report_name }} + + - name: Get all the regions that support FSxN that are opted into. + include_tasks: get_fsxn_regions.yaml + + - name: Generate the report for all the FSxNs. + include_tasks: process_region.yaml + loop: "{{ fsxn_regions }}" + loop_control: + loop_var: region diff --git a/Ansible/fsx_inventory_report/get_fsxn_regions.yaml b/Ansible/fsx_inventory_report/get_fsxn_regions.yaml new file mode 100644 index 0000000..06270df --- /dev/null +++ b/Ansible/fsx_inventory_report/get_fsxn_regions.yaml @@ -0,0 +1,26 @@ +# +# These tasks are used to set a variable named 'fsnx_regions' that contains a +# list of regions that support FSxN and are opted-in. +################################################################################ +- name: Get all the opted-in regions + amazon.aws.aws_region_info: + register: region_info + +- name: Get region names + set_fact: + enabled_regions: "{{ region_info.regions | map(attribute='region_name') | list }}" + +- name: Get the capabilities of all regions + set_fact: + regions_capabilities: "{{ lookup('ansible.builtin.url', 'https://api.regional-table.region-services.aws.a2z.com/index.json', split_lines=false) }}" + +- name: Filter regions that support FSxN and are opted-in + set_fact: + fsxn_regions: >- + {{ + regions_capabilities.prices + | selectattr("attributes.aws:serviceName", "equalto", "Amazon FSx for NetApp ONTAP") + | selectattr("attributes.aws:region", "in", enabled_regions) + | map(attribute="attributes.aws:region") + | list + }} diff --git a/Ansible/fsx_inventory_report/process_region.yaml b/Ansible/fsx_inventory_report/process_region.yaml new file mode 100644 index 0000000..fd848ab --- /dev/null +++ b/Ansible/fsx_inventory_report/process_region.yaml @@ -0,0 +1,24 @@ +# +# Since Ansible can't handle nested loops, this is a block of tasked that 
is +# run for each region. It assume that the calling playbook used 'region' as +# its loop variable. +################################################################################ +--- +- name: Get all the FSxNs for the specified region. + ansible.builtin.shell: + cmd: aws fsx describe-file-systems --region {{ region }} --query 'FileSystems[*].{FileSystemId:FileSystemId}' --output text | sed -e '/^$/d' + register: fsxn_ids_per_region + +- name: Get all the SVMs and volumes for each FSxN. + ansible.builtin.shell: + cmd: | + echo "Region: {{ region }}" >> {{ report_name }}; + fs={{ item }}; + echo " File System ID: ${fs}" >> {{ report_name }}; + svms=$(aws fsx describe-storage-virtual-machines --filters Name=file-system-id,Values=${fs} --region {{ region }} --output=text --query 'StorageVirtualMachines[*].StorageVirtualMachineId'); + for svm in $svms; do + echo " SVM ID: ${svm}" >> {{ report_name }}; + echo " Volume IDs:" >> {{ report_name }}; + aws fsx describe-volumes --filters Name=storage-virtual-machine-id,Values=${svm} --region {{ region }} --output=json --query 'Volumes[*].{Size: OntapConfiguration.SizeInMegabytes, ID: VolumeId, Name: Name, Type: OntapConfiguration.OntapVolumeType, Style: OntapConfiguration.SecurityStyle}' | jq -r '.[] | " \(.ID) \(.Type) \(.Style) \(.Size) \(.Name)"' >> {{ report_name }}; + done + loop: "{{ fsxn_ids_per_region.stdout_lines }}" diff --git a/Ansible/snapmirror_report/README.md b/Ansible/snapmirror_report/README.md new file mode 100644 index 0000000..3e15f94 --- /dev/null +++ b/Ansible/snapmirror_report/README.md @@ -0,0 +1,72 @@ +# Ansible SnapMirror Report +This Ansible playbook generates a report of all the FSx for ONTAP SnapMirror relationships within an AWS account. +The output of the report is a CSV file with the following columns: +- File System ID +- Source Path (svm:volume) +- Destination Path (svm:volume) +- State (e.g. snapmirrored, broken-off) +- Healthy (true or false) +- lag\_time (in "P#DT#H#M#S" format. 
See below for an explanation)
+
+The lag\_time format is an ASCII string that always starts with the letter 'P' and if the lag time is more than 24 hours it is followed by
+a number and the letter 'D'. The number represents the number of days. The next character is always a 'T' and is followed by
+number/letter pairs, where the letter is either an 'H', 'M', or 'S'. If the letter is 'H' then the number before it represents
+the number of hours. If the letter is 'M' then the number before it represents the number of minutes. If the letter is 'S' then
+the number before it represents the number of seconds. For example, 'P1DT2H3M4S' represents 1 day, 2 hours, 3 minutes, and 4 seconds.
+
+## Requirements
+- jq - A lightweight and flexible command-line JSON processor. Installation instructions can be found [here](https://jqlang.github.io/jq/download/)
+- Ansible 2.9 or later. Installation instructions can be found [here](https://docs.ansible.com/ansible/latest/installation_guide/index.html)
+- AWS Ansible collection. This should be included with the base installation of Ansible.
+- AWS secret(s) with the credentials necessary to run SnapMirror ONTAP APIs against the FSx for ONTAP file systems. The required format of the secret is described below.
+
+## Installation
+There are three files used to create the report:
+- `generate_report.yaml`: The Ansible playbook that generates the report.
+- `process_region.yaml`: A collection of tasks that will process all the FSxNs in a region.
+- `get_fsxn_regions.yaml`: A collection of tasks that retrieves all the regions, that are enabled for the account, where FSx for ONTAP is available.
+
+You will also need to create a file named (by default) `secrets_list.csv` that lists the secret name for each FSx file system.
+The format of the file should be:
+```
+file_system_id,secret_name
+```
+:warning: **NOTE:** Do not add any spaces before or after the file\_system\_id or secret\_name.
+ +Each secret should have two `keys`: +| Key | Value | +| --- | --- | +| `username` | The username to use to authenticate with the FSx for ONTAP file system. | +| `password` | The password to use to authenticate with the FSx for ONTAP file system. | + +## Configuration +There are a few variables that can be changed at the top of the `generate_report.yaml` file: +- report\_name - Sets the file path of the report that will be generated. +- secrets\_list\_file - Sets the file path of the file that contains the list of FSx file systems and their secrets. See above for more information. +- secrets\_region - Set the region where the secrets are stored. + +Since this script leverages the AWS Ansible collection as well as the `aws` cli, you will need to provide authentication credentials for them. +You can read more about how to do that [here](https://docs.ansible.com/ansible/latest/collections/amazon/aws/docsite/aws_ec2_guide.html#authentication). + +## Usage +To generate the report, run the following command: +```bash +ansible-playbook generate_report.yaml +``` +After a successful run, the report will be stored in the file specified by the `report_name` variable. + +## Author Information + +This repository is maintained by the contributors listed on [GitHub](https://github.com/NetApp/FSx-ONTAP-samples-scripts/graphs/contributors). + +## License + +Licensed under the Apache License, Version 2.0 (the "License"). + +You may obtain a copy of the License at [apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0). + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an _"AS IS"_ basis, without WARRANTIES or conditions of any kind, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. + +© 2024 NetApp, Inc. All Rights Reserved. 
diff --git a/Ansible/snapmirror_report/generate_report.yaml b/Ansible/snapmirror_report/generate_report.yaml new file mode 100644 index 0000000..306ceab --- /dev/null +++ b/Ansible/snapmirror_report/generate_report.yaml @@ -0,0 +1,36 @@ +# +# This Ansible playbook generates a SnapMirrorreport for all the +# SnapMirror relationships, in all the FSxNs, in all regions, +################################################################################# +--- +- vars: + report_name: output.csv + secrets_list_file: secrets_list.csv + secrets_region: us-west-2 +################################################################################# +# +# Don't change anything below this line. +# +################################################################################# + fsxn_regions: [] + opted_in_regions: [] + + name: Playbook to generate a SnapMirror report on all the FSxNs. + hosts: localhost + collections: + - amazon.aws + gather_facts: false + + tasks: + - name: Delete previous report while adding the header line. + ansible.builtin.shell: + cmd: echo fs,source,destination,state,healthy,lag_time > {{ report_name }} + + - name: Get all the regions that support FSxN that I am opted into. + include_tasks: get_fsxn_regions.yaml + + - name: Generate the report for all the FSxNs. + include_tasks: process_region.yaml + loop: "{{ fsxn_regions }}" + loop_control: + loop_var: region diff --git a/Ansible/snapmirror_report/get_fsxn_regions.yaml b/Ansible/snapmirror_report/get_fsxn_regions.yaml new file mode 100644 index 0000000..06270df --- /dev/null +++ b/Ansible/snapmirror_report/get_fsxn_regions.yaml @@ -0,0 +1,26 @@ +# +# These tasks are used to set a variable named 'fsnx_regions' that contains a +# list of regions that support FSxN and are opted-in. 
+################################################################################ +- name: Get all the opted-in regions + amazon.aws.aws_region_info: + register: region_info + +- name: Get region names + set_fact: + enabled_regions: "{{ region_info.regions | map(attribute='region_name') | list }}" + +- name: Get the capabilities of all regions + set_fact: + regions_capabilities: "{{ lookup('ansible.builtin.url', 'https://api.regional-table.region-services.aws.a2z.com/index.json', split_lines=false) }}" + +- name: Filter regions that support FSxN and are opted-in + set_fact: + fsxn_regions: >- + {{ + regions_capabilities.prices + | selectattr("attributes.aws:serviceName", "equalto", "Amazon FSx for NetApp ONTAP") + | selectattr("attributes.aws:region", "in", enabled_regions) + | map(attribute="attributes.aws:region") + | list + }} diff --git a/Ansible/snapmirror_report/process_region.yaml b/Ansible/snapmirror_report/process_region.yaml new file mode 100644 index 0000000..aecce2c --- /dev/null +++ b/Ansible/snapmirror_report/process_region.yaml @@ -0,0 +1,36 @@ +# +# Since Ansible can't handle nested loops, this is a block of tasked that is +# run for each region. It assume that the calling playbook used 'region' as its loop variable. +################################################################################# +--- +- name: Get all the FSxNs for the specified region. + ansible.builtin.shell: + cmd: aws fsx describe-file-systems --region {{ region }} --query 'FileSystems[*].{FileSystemId:FileSystemId}' --output text | sed -e '/^$/d' + register: fsxn_ids_per_region + +- name: Get the SnapMirror relationships for each FSxN. 
+ when: secret != 'n/a' + ansible.builtin.shell: + cmd: | + fs={{ item }}; + username="{{ lookup('amazon.aws.aws_secret', '{{ secret }}.username', region=secrets_region, nested=true, on_missing='skip') }}"; + password="{{ lookup('amazon.aws.aws_secret', '{{ secret }}.password', region=secrets_region, nested=true, on_missing='skip') }}"; + if [ "$username" = '[]' -o "$password" = '[]' ]; then + echo "Missing secret for file system $fs" 1>&2; + exit 0; + fi; + ip=$(aws fsx describe-file-systems --region {{ region }} --file-system-ids $fs --query 'FileSystems[0].OntapConfiguration.Endpoints.Management.IpAddresses[0]' --output=text); + curl -s -u "${username}:${password}" -k https://$ip/api/snapmirror/relationships?fields=source,destination,lag_time,state,healthy | jq -r '.records[] | "'${fs}',\(.source.path),\(.destination.path),\(.state),\(.healthy),\(.lag_time)"' + loop: "{{ fsxn_ids_per_region.stdout_lines }}" + register: snapmirror_relationships + vars: + secret: "{{ lookup('ansible.builtin.csvfile', item, file=secrets_list_file, delimiter=',', default='n/a') }}" + +- name: Write the SnapMirror relationships to a file. 
+ when: item.stdout is defined + ansible.builtin.shell: + cmd: | + if [ "{{ item.stdout }}" != "" ]; then + echo "{{ item.stdout }}" >> {{ report_name }}; + fi + loop: "{{ snapmirror_relationships.results }}" diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/README.md b/Management-Utilities/fsx-ontap-aws-cli-scripts/README.md index 7f42b1e..056f139 100644 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/README.md +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/README.md @@ -14,18 +14,18 @@ Before running the UNIX based scripts, make sure the following package is instal | Script | Description | |:------------------------|:----------------| -|create_fsxn_filesystem | Creates a new FSx for NetApp ONTAP file-system | -|create_fsxn_svm | Creates a new Storage Virtual Server (svm) in a soecific FSx ONTAP filesystem | +|create_fsxn_filesystem | Creates a new FSx for NetApp ONTAP file system. | +|create_fsxn_svm | Creates a new Storage Virtual Server (SVM) in a specified FSx for ONTAP file system. | |create_fsxn_volume | Creates a new volume under a specified SVM. | -|list_fsx_filesystems | List all the FSx for NetApp ONTAP filesystems that the user has access to. | -|list_fsx_filesystems.ps1 | List all the FSx for NetApp ONTAP filesystems that the user has access to, written in PowerShell. | +|list_fsx_filesystems | List all the FSx for NetApp ONTAP file systems that the user has access to. | +|list_fsx_filesystems.ps1 | List all the FSx for NetApp ONTAP file systems that the user has access to, written in PowerShell. | |list_fsxn_volumes | List all the FSx for NetApp ONTAP volumes that the user has access to. | |list_fsxn_svms | List all the storage virtual machines that the user access to. | -|list_aws_subnets | List all the aws subnets. | -|list_aws_vpcs | List all the aws vpcs. | -|delete_fsxn_filesystem | Deletes an FSx for NetApp ONTAP filesystem. | -|delete_fsxn_svm | Deletes an svm. | -|delete_fsxn_volume | Deletes a volume. 
| +|list_aws_subnets | List all the AWS subnets. | +|list_aws_vpcs | List all the AWS VPCs. | +|delete_fsxn_filesystem | Deletes a specified FSx for ONTAP file system. Including all the SVMs and volumes on it. | +|delete_fsxn_svm | Deletes a specified SVM. Including all the volumes assigned to it. | +|delete_fsxn_volume | Deletes a specified volume. | ## Author Information diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_filesystem b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_filesystem index 112391b..3a33901 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_filesystem +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_filesystem @@ -20,17 +20,19 @@ ################################################################################ usage () { cat 1>&2 < /dev/null 2>&1; then - : -else - echo "Error, both the 'aws' and 'jq' commands are required to run this script." 1>&2 - exit 1 -fi +integerRegex='^[0-9]+$' + +################################################################################ +# This function is used to see if the first value passed is in one of the +# other parameters passed. +################################################################################ +is_in(){ + + local value=$1 + shift + for i in "$@"; do + if [ $i == $value ]; then + return 0 + fi + done + return 1 +} + +for cmd in jq aws; do + if which $cmd > /dev/null 2>&1; then + : + else + echo "Error, the '$cmd' is required to run this script." 1>&2 + exit 1 + fi +done # # Set some defaults. size=1024 @@ -54,7 +89,9 @@ throughput=128 region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') securityGroupOption="" endpointips="" -azType="MULTI_AZ_1" +availType=multi +numPairs=1 +waitForCompletion=false # # Process command line arguments. while [ ! -z "$1" ]; do @@ -66,13 +103,10 @@ while [ ! -z "$1" ]; do shift ;; -size|--size) size="$2" - if ! [[ "$size" =~ '^[0-9]+$' ]]; then + if ! 
[[ "$size" =~ $integerRegEx ]]; then echo "-size must be an integer." usage fi - if [ "$size" -le 1024 ]; then - usage - fi shift ;; -subnetid1|--subnetid1) subnetID1="$2" @@ -85,31 +119,34 @@ while [ ! -z "$1" ]; do shift ;; -type|--type) - if [ "$(echo $2 | tr [A-Z] [a-z])" == "single" ]; then - azType="SINGLE_AZ_1" - elif [ "$(echo $2 | tr [A-Z] [a-z])" == "multi" ]; then - azType="MULTI_AZ_1" - else - echo "Error, known availability type '$2'." + availType=$2 + if [ $availType != "single" -a $availType != "multi" ]; then + echo "-type must be 'single' or 'multi'." 1>&2 usage fi shift ;; -throughput|--throughput) throughput="$2" - if ! [[ "$throughput" =~ '^[0-9]+$' ]]; then + if ! [[ "$throughput" =~ $integerRegEx ]]; then echo "-throughput must be an integer." usage fi - if [ "$througput" != "128" -a "$througput" != "256" -a "$throughput" != "512" -a "$throughput" != "1024" -a "$throughput" != "2048" -a "$throughput" != "4096" ]; then - echo "-throughput must be 128 or 256 or 512 or 1024 or 2048 or 4096." - usage - fi shift ;; -endpointiprange|--endpointiprange) endpointips='"EndpointIpAddressRange": "'$2'",' shift ;; + -numberpairs|--numberpairs) numPairs="$2" + if ! [[ "$numPairs" =~ $integerRegEx ]]; then + echo "-numPairs must be an integer." + usage + fi + shift + ;; + -wait|--wait) + waitForCompletion=true + ;; -h|-help|--help) usage ;; @@ -119,16 +156,44 @@ while [ ! -z "$1" ]; do esac shift done + +if is_in "$throughput" "${throughputValuesGen1[@]}"; then + if [ $availType == "single" ]; then + azType="SINGLE_AZ_1" + elif [ $availType == "multi" ]; then + azType="MULTI_AZ_1" + else + echo "Error, unknown availability type '$availType'." + usage + fi +elif is_in "$throughput" "${throughputValuesGen2[@]}"; then + if [ $availType == "single" ]; then + azType="SINGLE_AZ_2" + elif [ $availType == "multi" ]; then + azType="MULTI_AZ_2" + else + echo "Error, unknown availability type '$availType'." 
+ usage + fi +else + echo "Error, unsupported throughput value '$throughput'." + usage +fi # # Ensure all the required parameters have been provided. -if [ -z "$fileSystemName" -o -z "$subnetID1" -o "$azType" == "MULTI_AZ_1" -a -z "$subnetID2" ]; then - echo "Missing arguments." 1>&2 +if [ -z "$fileSystemName" ]; then + echo "Error, you must specify a file system name." 1>&2 + usage +fi + +if [ -z "$subnetID1" -o "$azType" == "MULTI_AZ_1" -a -z "$subnetID2" -o "$azType" == "MULTI_AZ_2" -a -z "$subnetID2" ]; then + echo "Error, you must specify only subnetID1 for a single availability zone deployments or both subnetID1 and subnetID2 for a multi availability zone deployments." 1>&2 usage - exit 1 fi -if [ $azType == "SINGLE_AZ_1" ]; then + +if [[ $azType == *"SINGLE_AZ"* ]]; then if [ ! -z "$endpointips" ]; then - echo "Error, you can not specify Endpoint IP address range when deploying in a single availability zone." 1>&2 + echo "Error, you cannot specify Endpoint IP address range when deploying in a single availability zone." 1>&2 exit 1 fi @@ -138,12 +203,32 @@ if [ $azType == "SINGLE_AZ_1" ]; then fi fi -aws fsx create-file-system --output=json --file-system-type ONTAP --storage-capacity $size --subnet-ids $subnetID1 $subnetID2 --storage-type SSD --tags "Key=Name,Value=$fileSystemName" $securityGroupOption --ontap-configuration '{ +if [ $numPairs -gt 1 ]; then + if ! is_in "$throughput" "${throughputValuesMultiHAPairs[@]}"; then + echo "Error, you can only specify more than one HA pair with throughput values of 1536, 3072, and 6144." 1>&2 + usage + fi + + if [ $azType != "SINGLE_AZ_2" ]; then + echo "Error, you can only specify more than one HA pair with a single availability zone deployment." 1>&2 + usage + fi +fi + +minSize=$((1024*numPairs)) +if [ $size -lt $minSize ]; then + echo "Error, the size must be at least $minSize for $numPairs HA pairs. In other words 1024 per HA pair." 
1>&2 + usage +fi + +echo aws fsx create-file-system --output=json --file-system-type ONTAP --storage-capacity $size --subnet-ids $subnetID1 $subnetID2 --storage-type SSD --tags "Key=Name,Value=$fileSystemName" $securityGroupOption --ontap-configuration '{ "PreferredSubnetId": "'$subnetID1'", '$endpointips' "DeploymentType": "'$azType'", - "ThroughputCapacity": '$throughput'}' --region=$region > $tmpout 2>&1 - + "HAPairs": '$numPairs', + "ThroughputCapacityPerHAPair": '$throughput'}' --region=$region +# "ThroughputCapacityPerHAPair": '$throughput'}' --region=$region > $tmpout 2>&1 +exit if [ $? != "0" ]; then echo "Failed to create FSxN file system." 1>&2 cat $tmpout 1>&2 @@ -151,11 +236,49 @@ if [ $? != "0" ]; then else status=$(jq -r .FileSystem.Lifecycle $tmpout 2> /dev/null) if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then - echo "File system '$fileSystemName' ($(jq -r .FileSystem.FileSystemId $tmpout)) is being created." - exit 0 + fsid=$(jq -r .FileSystem.FileSystemId $tmpout) + printf "File system '$fileSystemName' ($fsid) is being created." + if [ $waitForCompletion == "true" ]; then + i=0 + while [ $i -lt $MaxIterations ]; do + aws fsx describe-file-systems --file-system-ids $fsid --output=json --region=$region > $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r '.FileSystems[0].Lifecycle' $tmpout 2> /dev/null) + if [ "$status" == "AVAILABLE" ]; then + printf "\nFile system '$fileSystemName' ($fsid) has been created.\n" + break + fi + if [ "$status" != "CREATING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to create the file system. Status = $status\n" 1>&2 + reason="$(jq -r '.FileSystems[0].LifecycleTransitionReason.Message' $tmpout 2> /dev/null)" + if [ ! -z "$reason" ]; then + echo "Reason: $reason" 1>&2 + else + cat $tmpout 1>&2 + fi + exit 1 + fi + printf "." 
+ else + printf "\nError, failed to get the file system status.\n" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + printf "\nFailed to create file system('$fsid'). Taking too long.\n" 1>&2 + exit 1 + fi + exit 0 + else + echo + fi else echo "Unknown status '$status'. Complete output returned from the AWS api:" 1>&2 cat $tmpout 1>&2 exit 1 fi fi +exit 0 diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_svm b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_svm index 8935954..aa16246 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_svm +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_svm @@ -21,12 +21,13 @@ ################################################################################ usage () { cat 1>&2 < /dev/null | jq -r ".FileSystems[] | if((.Tags[] | select(.Key == \"Name\") .Value) == \"${fileSystemName}\") then .FileSystemId else empty end" 2> /dev/null) + fsid=$(aws fsx describe-file-systems --region $region --output=json 2> /dev/null | jq -r ".FileSystems[] | if((.Tags[] | select(.Key == \"Name\") .Value) == \"${fileSystemName}\") then .FileSystemId else empty end" 2> /dev/null) fi if [ -z "$fsid" ]; then - echo "Error, could not find the file system with name '$fileSystemName}' in region $region." 1>&2 + echo "Error, could not find the file system with name '${fileSystemName}' in region $region." 1>&2 exit 1 fi # @@ -94,11 +100,50 @@ if [ $? != "0" ]; then else status=$(jq -r .StorageVirtualMachine.Lifecycle $tmpout 2> /dev/null) if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then - echo "Stroage Virtaul Machine '$svmName'($(jq -r '.StorageVirtualMachine.StorageVirtualMachineId' $tmpout)) is being created." - exit 0 + svmId=$(jq -r '.StorageVirtualMachine.StorageVirtualMachineId' $tmpout) + printf "Stroage Virtaul Machine '$svmName'($svmId) is being created." + # + # Wait for the svm to be deleted. 
+ if [ $waitForCompletion == true ]; then + i=0 + while [ $i -lt $maxIterations ]; do + aws fsx describe-storage-virtual-machines --storage-virtual-machine-ids $svmId --output=json --region=$region > $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r '.StorageVirtualMachines[0].Lifecycle' $tmpout 2> /dev/null) + if [ "$status" == "CREATED" ]; then + printf "\nStorage Virtual Machine '$svmName'($svmId) has been created.\n" + break + fi + if [ "$status" != "CREATING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to create SVM with SVM ID '$svmId'. Status = $status\n" 1>&2 + reason="$(jq -r '.StorageVirtualMachines[0].LifecycleTransitionReason.Message' $tmpout)" + if [ ! -z "$reason" ]; then + echo "Reason: $reason" 1>&2 + else + cat $tmpout 1>&2 + fi + exit 1 + fi + else + printf "\nError, failed to get status of SVM with SVM ID '$svmId'.\n" 1>&2 + cat $tmpout 1>&2 + exit 1 + fi + printf "." + sleep $sleepTime + let i+=1 + done + if [ $i -ge $maxIterations ]; then + printf "\nFailed to create SVM with SVM ID of '$svmID'. Taking too long.\n" 1>&2 + exit 1 + fi + else + printf "\n" + fi else echo "Unknown status '$status'. 
Complete output returned from the AWS api:" 1>&2 cat $tmpout 1>&2 exit 1 fi fi +exit 0 diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_volume b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_volume index eb9e592..d16bda6 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_volume +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/create_fsxn_volume @@ -21,12 +21,14 @@ ################################################################################ usage () { cat 1>&2 <&2 + usage + fi + ;; + v) volumeStyle=$OPTARG ;; - w) wait=1 + a) aggregateOption='"AggregateConfiguration":{"Aggregates":["'$OPTARG'"]},' + ;; + w) waitForCompletion=true ;; *) usage ;; @@ -66,15 +89,23 @@ if [ -z "$volumeName" -o -z "$svmId" ]; then usage fi -aws fsx create-volume --volume-type ONTAP --name $volumeName --ontap-configuration "{ - \"JunctionPath\": \"/$volumeName\", - \"SecurityStyle\": \"UNIX\", - \"SizeInMegabytes\" : $size, - \"StorageEfficiencyEnabled\": true, - \"StorageVirtualMachineId\": \"$svmId\", - \"TieringPolicy\" : {\"CoolingPeriod\": 31, \"Name\": \"SNAPSHOT_ONLY\"}, - \"OntapVolumeType\": \"RW\", - \"SnapshotPolicy\": \"default\"}" --region=$region --output=json > $tmpout 2>&1 +volumeStyle=$(echo $volumeStyle | tr '[a-z] ' '[A-Z]') +if [ $volumeStyle != "FLEXVOL" -a $volumeStyle != "FLEXGROUP" ]; then + echo "Error, volume style must be either FlexVol or FlexGroup." 1>&2 + usage +fi + +aws fsx create-volume --volume-type ONTAP --name $volumeName --ontap-configuration '{ + "JunctionPath": "/'$volumeName'", + "SecurityStyle": "UNIX", + "SizeInMegabytes" : '$size', + "StorageEfficiencyEnabled": true, + "StorageVirtualMachineId": "'$svmId'", + "TieringPolicy" : {"CoolingPeriod": 31, "Name": "SNAPSHOT_ONLY"}, + "OntapVolumeType": "RW", + "VolumeStyle": "'$volumeStyle'", + '$aggregateOption' + "SnapshotPolicy": "default"}' --region=$region --output=json > $tmpout 2>&1 if [ $? 
!= "0" ]; then echo "Failed to create the FSxN volume." 1>&2 @@ -84,21 +115,40 @@ else status=$(jq -r .Volume.Lifecycle $tmpout 2> /dev/null) if [ "$status" == "CREATING" -o "$status" == "PENDING" ]; then volumeId=$(jq -r .Volume.VolumeId $tmpout) - echo "FSxN volume '$volumeName'($volumeId) is being created." - if [ ! -z "$wait" ]; then - echo -n "Waiting for volume to be created." - while [ $status == "CREATING" -o $status == "PENDING" ]; do - sleep 4 + printf "Volume '$volumeName'($volumeId) is being created." + if [ "$waitForCompletion" == "true" ]; then + i=0 + while [ $i -lt $maxIterations ]; do aws fsx describe-volumes --volume-ids $volumeId --region=$region --output=json > $tmpout 2>&1 status=$(jq -r .Volumes[0].Lifecycle $tmpout 2> /dev/null) + if [ $status == "CREATED" ]; then + printf "\nVolume '$volumeName'($volumeId) has been created.\n" + break + fi + if [ $status != "CREATING" -a $status != "PENDING" ]; then + echo "Error, volume $volumeId creation failed." 1>&2 + reason="$(jq -r '.Volumes[0].LifecycleTransitionReason.Message' $tmpout 2> /dev/null)" + if [ -n "$reason" ]; then + echo "Reason: $reason" 1>&2 + else + cat $tmpout 1>&2 + fi + exit 1 + fi echo -n "." + sleep $sleepTime done - printf "\nVolume as been $status.\n" + if [ $i -gt $maxIterations ]; then + printf "\nError, volume creation failed. Took too long." 1>&2 + exit 1 + fi + else + printf "\n" fi - exit 0 else echo "Unknown status '$status'. Complete output returned from the AWS api:" 1>&2 cat $tmpout 1>&2 exit 1 fi fi +exit 0 diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_filesystem b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_filesystem index 0d8b7d5..39a0750 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_filesystem +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_filesystem @@ -12,7 +12,13 @@ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
################################################################################ # -# This script is used to delete a FSxN filesystem. +# This script is used to delete a FSxN file system. It does that by first +# displaying all the SVM and volumes associated with the file system and then +# asking the user if they are sure they want to delete the file system. If the +# user responds with 'yes', then the script will delete all the SVMs associated +# with the file system. The 'delete_fsxn_svm' script will delete all the volumes +# associated with the SVM. Once all the SVMs and volumes have been deleted, the +# script will then delete the file system itself. ################################################################################ ################################################################################ @@ -21,12 +27,13 @@ ################################################################################ usage () { cat 1>&2 < $tmpout 2>&1 - if [ $? != "0" ]; then - printf "\nError, failed to delete a volume with volumeId: '$volumeId'.\n" 1>&2 - cat $tmpout 1>&2 - return 1 - fi - # - # Wait for the volume to be deleted. - i=0 - while [ $i -lt $MaxIterations ]; do - aws fsx describe-volumes --volume-ids $volumeId --output=json --region=$region > $tmpout 2>&1 - if [ $? -eq 0 ]; then - status=$(jq -r .Volumes[0].Lifecycle $tmpout 2> /dev/null) - if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then - printf "\nError, failed to delete volume with volume ID '$volumeId'. Status = ${status}.\n" 1>&2 - cat $tmpout 1>&2 - return 1 - fi - else - # Assume if it failed, it is because the volume was deleted and doesn't exist anymore. - break - fi - sleep $SleepTime - let i+=1 - done - if [ $i -ge $MaxIterations ]; then - echo "Failed to delete volume with volume ID of '$volumeId'. Taking too long." 
1>&2 - return 1 - fi - return 0 -} - -################################################################################ -# This function is used to delete an FSxN SVM. It waits for the SVM to be -# deleted. It assumes the SVM has been deleted when the API call to display -# its status returns an error. -################################################################################ -delete_svm() { - - local tmpout=/tmp/delete_fsxn_delete_svm.$BASHPID - trap 'rm -f $tmpout' RETURN - - local svmId=$1 - aws fsx delete-storage-virtual-machine --region=$region --output=json --storage-virtual-machine-id $svmId > $tmpout 2>&1 - if [ $? != "0" ]; then - printf "\nError, failed to delete a SVM with svmID: '$svmId'.\n" 1>&2 - cat $tmpout 1>&2 - return 1 - fi - # - # Wait for the svm to be deleted. - i=0 - while [ $i -lt $MaxIterations ]; do - aws fsx describe-storage-virtual-machines --storage-virtual-machine-ids $svmId --output=json --region=$region > $tmpout 2>&1 - if [ $? -eq 0 ]; then - status=$(jq -r '.StorageVirtualMachines[0].Lifecycle' $tmpout 2> /dev/null) - if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then - printf "\nError, failed to delete SVM with SVM ID '$svmId'. Status = $status\n" 1>&2 - cat $tmpout 1>&2 - return 1 - fi - else - # Assume if it failed, it is because the SVM was delted and therefore doesn't exist anymore. - break - fi - sleep $SleepTime - let i+=1 - done - if [ $i -ge $MaxIterations ]; then - printf "\nFailed to delete SVM with SVM ID of '$svmID'. Taking too long.\n" 1>&2 - return 1 - fi - return 0 -} - ################################################################################ # Main logic starts here. ################################################################################ @@ -144,31 +63,32 @@ svmsFile=/tmp/fsx_fs_delete_svms.$$ volumesFile=/tmp/fsx_fs_delete_volumes.$$ trap 'rm -f $tmpout $svmsFile $volumesFile' exit # -# Set the maximum number of times to check that a volume and/or SVM has been -# deleted. 
Multiple it by the SleepTime set below to the total amount of -# time allowed. Note, it takes at least 4 minutes to delete a volume. +# Set the maximum number of times to check that will be made to see if a +# file system has been deleted. Multiple it by the SleepTime set below to +# the total amount of time allowed. MaxIterations=120 # -# Set the number of seconds to wait between checks that a volume and/or SVM has been deleted. +# Set the number of seconds to wait between checks that the file system +# has been deleted. SleepTime=5 # -# Set the maximum number of "volume deletes" that can be running at the same time. -MaxDeletesRunning=20 -# # Check that the required commands are available. -if which jq aws > /dev/null 2>&1; then - : -else - echo "Error, both the 'aws' and 'jq' commands is required to run this script." 1>&2 - exit 1 -fi +for cmd in jq aws delete_fsxn_svm delete_fsxn_volume; do + if which $cmd > /dev/null 2>&1; then + : + else + echo "Error, command '$cmd' is required to run this script." 1>&2 + exit 1 + fi +done # # Get the default region. region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') -skipBackup=true +enableBackup=false +waitForCompletion=false # # Process command line arguments. -while getopts "hbr:f:i:" option; do +while getopts "hwbr:f:i:" option; do case $option in f) fileSystemName=$OPTARG ;; @@ -176,7 +96,9 @@ while getopts "hbr:f:i:" option; do ;; i) fsid=$OPTARG ;; - b) skipBackup=false + b) enableBackup=true + ;; + w) waitForCompletion=true ;; *) usage ;; @@ -190,9 +112,15 @@ fi # # Ensure all the required parameters have been provided. if [ -z "$fileSystemName" -a -z "$fsid" ]; then - echo "Error, missing required arguments." 1>&2 + echo "Error, you must provide either the filesystem name or the file system id that you want to delete." 1>&2 usage # implied exit fi + +if [ $enableBackup == "true" ]; then + enableBackup="-b" # Turn it into a flag passed to the delete_fsxn_svm script. 
+else + enableBackup="" +fi # # Get the file system id based on the name. if [ -z "$fsid" ]; then @@ -212,135 +140,61 @@ if [ -z "$fsid" ]; then else # # Get the file system name based on the fsid. - fileSystemName=$(aws fsx describe-file-systems --file-system-ids $fsid --region=$region --output=json 2> /dev/null | jq -r '.FileSystems[0].Tags[] | select(.Key == "Name") .Value' 2> /dev/null) - if [ -z "$fileSystemName" ]; then + aws fsx describe-file-systems --file-system-ids $fsid --region=$region --output=json > $tmpout 2>&1 + if [ $? -ne 0 ]; then echo "Error, failed to get the file system name based on the ID ($fsid)." 1>&2 + cat $tmpout 1>&2 exit 1 fi + fileSystemName=$(jq -r '.FileSystems[0].Tags[] | select(.Key == "Name") .Value' $tmpout) + # + # Since it isn't required to have a name (Name tag), set it to "Not Set" if it doesn't exist.k + if [ -z "$fileSystemName" ]; then + fileSystemName="Not Set" + fi fi # -# Create a JSON file with all the FSxN SVMs in the region. -aws fsx describe-storage-virtual-machines --region=$region --output=json > $svmsFile 2>&1 +# Create a JSON file with all the SVMs associated with the file system. +aws fsx describe-storage-virtual-machines --region=$region --output=json --filters Name=file-system-id,Values=$fsid > $svmsFile 2>&1 if [ $? -ne 0 ]; then echo "Error, failed to get the list of SVMs." 1>&2 cat $svmsFile 1>&2 exit 1 fi # -# Create a JSON file with all the FSXN volumes in the region. -aws fsx describe-volumes --region=$region --output=json > $volumesFile 2>&1 +# Create a JSON file with all the FSXN volumes associated with the file system. +aws fsx describe-volumes --region=$region --output=json --filters Name=file-system-id,Values=$fsid > $volumesFile 2>&1 if [ $? -ne 0 ]; then echo "Error, failed to get the list of volumes." 
1>&2 cat $volumesFile 1>&2 exit 1 fi +numVolumes=$(jq '.Volumes | length' $volumesFile) +numSvms=$(jq '.StorageVirtualMachines | length' $svmsFile) # # Make sure the user really wants to delete the file system. -echo "Here are the current contents of the '$fileSystemName'($fsid) file system you have indicated you want to delete:" -displayFileSystemContents $fsid -read -p "Are you sure you want to delete this file system, with all the above volumes (yes/no)? " response -if [ "$response" != "yes" ]; then - echo "Aborted." - exit 1 +if [ $numVolumes -gt 0 -o $numSvms -gt 0 ]; then + echo "Here are the current contents of the '$fileSystemName'($fsid) file system you have indicated you want to delete:" + displayFileSystemContents $fsid + read -p "Are you sure you want to delete this file system, with all the above volumes (yes/no)? " response + if [ "$response" != "yes" ]; then + echo "Aborted." + exit 1 + fi fi # -# Before you can delete a file system, you have to first delete all the volumes, -# and then all the SVMs. So, first get the list of SVMs: -declare -a svms -declare -a volumes -svms=($(jq -r '.StorageVirtualMachines[] | if(.FileSystemId == "'$fsid'") then .StorageVirtualMachineId else empty end' $svmsFile)) +# Create a list of all the SVMs associated with the file system. +svms=($(jq -r '.StorageVirtualMachines[] | select(.FileSystemId == "'$fsId'") | .StorageVirtualMachineId' $svmsFile)) # -# Now delete all the volumes for each SVM. I could just deleted all the volumes -# associated with the fsid, but I wanted the extra check on the volumeId to be -# associated with one of the SVMs that is associated with the fsid. +# First delete all the SVMs. The 'delete_fsxn_svm' script will delete all the volumes associated with the SVM. for svmId in ${svms[*]}; do - # - # Create an array with all the non-root volume IDs for this SVM. 
- volumes=($(jq -r '.Volumes[] | if(.OntapConfiguration.StorageVirtualMachineId == "'$svmId'" and (.OntapConfiguration.StorageVirtualMachineRoot | not) and .FileSystemId == "'$fsid'") then .VolumeId else empty end' $volumesFile)) - if [ ! -z "${volumes[*]}" ]; then - # - # Since it can take a while for a single volume to be deleted (e.g. 4 minutes - # for a small empty volume) and you can do multiple deletes in parallel, - # spawn them in the background and wait for them to finish. Although, since - # we don't want to overwhelm either AWS or ONTAP, only allow a certain - # number at a time. - i=0 - numRunning=0 - numVolumes=${#volumes[*]} - maxNumRunning=1 # Only do one initially, if it completes successfully, then do the rest concurrently. - printf "\nDeleting all the volumes associated with ${svmId}.\n" - while [ $i -lt $numVolumes ]; do - delete_volume ${volumes[$i]} $skipBackup & - let i+=1 - let numRunning+=1 - printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). " - if [ $numRunning -ge $maxNumRunning ]; then - # - # Wait for a job to complete. - wait -n - rc=$? - if [ $rc -eq 127 ]; then - # - # 127 means there were no background jobs. Since we just deployed one, that shouldn't happen. - printf "\nError, got an expected response from 'wait'. Aborting.\n" 1>&2 - exit 1 - fi - if [ $rc -ne 0 ]; then - printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 - exit 1 - fi - let numRunning-=1 - if [ $i -eq 1 ]; then - # The first one succeeded, open up the flood gates. - maxNumRunning=$MaxDeletesRunning - fi - fi - done - # - # Now that we have queued them all up, wait for them to finish. - wait -n - rc=$? - let numRunning-=1 - while [ "$rc" != 127 ]; do - printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). 
" - if [ "$rc" != 0 ]; then - printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 - exit 1 - fi - wait -n - rc=$? - let numRunning-=1 - done + delete_fsxn_svm -n -w $enableBackup -i $svmId -r $region + if [ $? -ne 0 ]; then + echo "Error, failed to delete the SVM with SVM ID '$svmId'." 1>&2 + exit 1 fi -done # for svmId in ${svms[*]}; do -# -# Now that all the volumes are deleted, delete the SVMs. -# Since there can only be 24 SVMs, don't really have to worry about spawning -# too many at a time. -[ ${#svms[*]} -gt 0 ] && printf "\nDeleting SVMs.\n" -for svmId in ${svms[*]}; do - delete_svm $svmId & done # -# Now wait for them to finish. -if [ ! -z "$svms" ]; then - numRunning=${#svms[*]} - printf "\rTotal number of SVMs to delete: ${#svms[*]}. Number of deletes currently running: ${numRunning}. Number waiting to be started: 0. " - wait -n - rs=$? - let numRunning-=1 - while [ "$rs" != 127 ]; do - if [ "$rs" != 0 ]; then - printf "\nError, one of the SVM deletes failed. Aborting!\n" 1>&2 - exit 1 - fi - printf "\rTotal number of SVMs to delete: ${#svms[*]}. Number of deletes currently running: ${numRunning}. Number waiting to be started: 0. " - wait -n - rs=$? - let numRunning-=1 - done -fi -# # Now that all the volumes and all the SVMs have been deleted, we can delete the filesystem. aws fsx delete-file-system --file-system-id $fsid --output=json --region=$region > $tmpout 2>&1 if [ $? != "0" ]; then @@ -350,7 +204,41 @@ if [ $? != "0" ]; then else status=$(jq -r .Lifecycle $tmpout) if [ "$status" == "DELETING" -o "$status" == "PENDING" ]; then - printf "\nFile system '$fileSystemName' is being deleted.\n" + printf "\nFile system '$fileSystemName'($fsid) is being deleted." + if [ $waitForCompletion == "true" ]; then + i=0 + while [ $i -lt $MaxIterations ]; do + aws fsx describe-file-systems --file-system-ids $fsid --output=json --region=$region > $tmpout 2>&1 + if [ $? 
-eq 0 ]; then + status=$(jq -r '.FileSystems[0].Lifecycle' $tmpout 2> /dev/null) + if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to delete file system with file system ID '$fsid'. Status = $status\n" 1>&2 + reason="$(jq -r '.FileSystems[0].LifecycleTransitionReason.Message' $tmpout 2> /dev/null)" + if [ ! -z "$reason" ]; then + printf "Reason: $reason\n" 1>&2 + else + cat $tmpout 1>&2 + fi + exit 1 + else + printf "." + fi + else + # Assume if it failed, it is because the filesystem was deleted and therefore doesn't exist anymore. + printf "\nFile system '$fileSystemName'($fsid) has been deleted.\n" + break + fi + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + printf "\nFailed to delete file system with filesystem ID of '$fsid'. Taking too long.\n" 1>&2 + exit 1 + fi + exit 0 + else + echo + fi exit 0 else printf "\nUnknown status '$status'. Complete output returned from the AWS api:\n" 1>&2 diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_svm b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_svm index 3c9707a..aad0270 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_svm +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_svm @@ -13,7 +13,9 @@ ################################################################################ # # This script is used to delete a storage virtual machine from a -# FSxN filesystem. +# FSxN filesystem. It will delete all the volumes associated with the +# storage virtual machine as well. +# ################################################################################ ################################################################################ @@ -21,10 +23,13 @@ ################################################################################ usage () { cat 1>&2 < /dev/null 2>&1; then - : -else - echo "Error, both the 'aws' and 'jq' commands are required to run this script." 
1>&2 - exit 1 -fi +for cmd in aws jq delete_fsxn_volume; do + if which $cmd > /dev/null 2>&1; then + : + else + echo "Error, a required command '$cmd' is not in the search path. Please install it and try again." 1>&2 + exit 1 + fi +done # # Set any defaults. region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +waitForDelete=false +noQuery=false +enableBackup=false # # Process command line arguments. -while getopts "hi:r:" option; do +while getopts "hbi:wnr:" option; do case $option in i) svmID=$OPTARG ;; r) region=$OPTARG ;; + w) waitForDelete=true + ;; + n) noQuery=true + ;; + b) enableBackup=true + ;; *) usage ;; esac - shift done + +if [ $enableBackup == "true" ]; then + enableBackup="-b" # This is just a flag to the delete_fsxn_volume script. +else + enableBackup="" +fi # # Ensure all the required parameters have been provided. if [ -z "$svmID" ]; then @@ -65,18 +86,155 @@ if [ -z "$svmID" ]; then usage exit 1 fi +# +# Set the maximum number of volume deletes that can be running at the same time. +MaxDeletesRunning=20 +# +# Set the maximum number of iterations to wait for the SVM delete to complete. +# The total time to wait is the product of MaxIterations and SleepTime. +MaxIterations=120 +# +# Set the sleep time between iterations checking to see if the SVM delete has completed. +SleepTime=5 +# +# Get the list of volumes associated with the SVM. +aws fsx describe-volumes --region=$region --output=json --filters Name=storage-virtual-machine-id,Values=$svmID > $volumesFile 2>&1 +if [ $? -ne 0 ]; then + echo "Error, failed to get the list of volumes for SVM: $svmID." 1>&2 + cat $volumesFile 1>&2 + exit 1 +fi + +if [ $noQuery != "true" ]; then + # + # Display the voluems in the SVM to make sure the user really wants to delete them. + numVolumes=$(jq '.Volumes | length' $volumesFile) + if [ $numVolumes -gt 1 ]; then # Assume there is a root volume that will be skipped. 
+ echo "The following volumes are associated with the storage virtual machine with an id of '$svmID':" + while read volumeId volumeName; do + volumes=(${volumes[@]} $volumeId) + printf "\t$volumeId - '$volumeName'\n" + done < <(jq -r '.Volumes[] | select(.OntapConfiguration.StorageVirtualMachineRoot | not) | .VolumeId + " " + .Name' $volumesFile) + fi + read -p "Are you sure you want to delete the storage virtual machine with an id of '$svmID' and all its volumes? (yes/no): " answer + + if [ "$answer" != "yes" ]; then + echo "Aborting deletion of storage virtual machine." 1>&2 + exit 1 + fi +else + # + # Create the volumes array. + while read volumeId; do + volumes=(${volumes[@]} $volumeId) + done < <(jq -r '.Volumes[] | select(.OntapConfiguration.StorageVirtualMachineRoot | not) | .VolumeId' $volumesFile) +fi + +if [ ! -z "${volumes[*]}" ]; then + # + # Since it can take a while for a single volume to be deleted (e.g. 4 minutes + # for a small empty volume) and you can do multiple deletes in parallel, + # spawn them in the background and wait for them to finish. Although, since + # we don't want to overwhelm either AWS or ONTAP, only allow a certain + # number at a time. + i=0 + numRunning=0 + numVolumes=${#volumes[*]} + maxNumRunning=1 # Only do one initially, if it completes successfully, then do the rest concurrently. + printf "\nDeleting all the volumes associated with ${svmID}.\n" + while [ $i -lt $numVolumes ]; do + delete_fsxn_volume -r $region -w -q -i ${volumes[$i]} $enableBackup & + let i+=1 + let numRunning+=1 + printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). " + if [ $numRunning -ge $maxNumRunning ]; then + # + # Wait for a job to complete. + wait -n + rc=$? + if [ $rc -eq 127 ]; then + # + # 127 means there were no background jobs. Since we just deployed one, that shouldn't happen. + printf "\nError, got an expected response from 'wait'. 
Aborting.\n" 1>&2 + exit 1 + fi + if [ $rc -ne 0 ]; then + printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 + exit 1 + fi + let numRunning-=1 + if [ $i -eq 1 ]; then + # The first one succeeded, open up the flood gates. + maxNumRunning=$MaxDeletesRunning + fi + fi + done + # + # Now that we have queued them all up, wait for them to finish. + wait -n + rc=$? + let numRunning-=1 + while [ "$rc" != 127 ]; do + printf "\rTotal number of volumes to delete: ${numVolumes}. Number of deletes currently running: ${numRunning}. Number waiting to be started: $((numVolumes-i)). " + if [ "$rc" != 0 ]; then + printf "\nError, one of the volume deletes failed. Aborting!\n" 1>&2 + exit 1 + fi + wait -n + rc=$? + let numRunning-=1 + done + echo "" +fi +# +# Now that all the volumes have been deleted, delete the storage virtual machine. aws fsx delete-storage-virtual-machine --region=$region --storage-virtual-machine-id $svmID > $tmpout 2>&1 if [ $? != "0" ]; then echo "Failed to delete storage virtual machine." 1>&2 - cat $tmpout + cat $tmpout 1>&2 exit 1 else status=$(jq -r .Lifecycle $tmpout) if [ "$status" == "DELETING" ]; then - echo "Storage Virtual Machine with an id of '$svmID' is being deleted." - exit 0 + echo -n "Storage Virtual Machine with an id of '$svmID' is being deleted." + # + # Wait for the svm to be deleted. + if [ $waitForDelete == "true" ]; then + i=0 + while [ $i -lt $MaxIterations ]; do + aws fsx describe-storage-virtual-machines --storage-virtual-machine-ids $svmID --output=json --region=$region > $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r '.StorageVirtualMachines[0].Lifecycle' $tmpout 2> /dev/null) + if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to delete SVM with SVM ID '$svmID'. Status = $status\n" 1>&2 + reason="$(jq -r '.StorageVirtualMachines[0].LifecycleTransitionReason.Message' $tmpout 2> /dev/null)" + if [ ! 
-z "$reason" ]; then + echo "Reason: $reason" 1>&2 + else + cat $tmpout 1>&2 + fi + exit 1 + else + printf "." + fi + else + # Assume if it failed, it is because the SVM was delted and therefore doesn't exist anymore. + printf "\nStorage Virtual Machine with an id of '$svmID' has been deleted.\n" + break + fi + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + printf "\nFailed to delete SVM with SVM ID of '$svmID'. Taking too long.\n" 1>&2 + exit 1 + fi + exit 0 + else + echo + fi else echo "Unknown status '$status'. Complete output returned from the AWS api:" cat $tmpout diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_volume b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_volume index b164aff..31d2144 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_volume +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/delete_fsxn_volume @@ -20,15 +20,56 @@ ################################################################################ usage () { cat 1>&2 < $tmpout 2>&1 + if [ $? -eq 0 ]; then + status=$(jq -r .Volumes[0].Lifecycle $tmpout 2> /dev/null) + if [ "$status" != "DELETING" -a "$status" != "PENDING" ]; then + printf "\nError, failed to delete volume with volume ID '$volumeId'. Status = ${status}.\n" 1>&2 + reason="$(jq -r '.Volumes[0].LifecycleTransitionReason.Message' $tmpout 2> /dev/null)" + if [ ! -z "$reason" ]; then + echo "Reason: $reason" 1>&2 + else + cat $tmpout 1>&2 + fi + return 1 + fi + else + # Assume if it failed, it is because the volume was deleted and doesn't exist anymore. + break + fi + [ $quiet != "true" ] && printf "." + sleep $SleepTime + let i+=1 + done + if [ $i -ge $MaxIterations ]; then + printf "\nFailed to delete volume with volume ID of '$volumeId'. Taking too long.\n" 1>&2 + return 1 + fi + return 0 +} + ################################################################################ # Main logic starts here. 
################################################################################ @@ -37,10 +78,12 @@ trap 'rm -f $tmpout' exit # # Set some defaults. region=$(aws configure list | egrep '^.*egion ' | awk '{print $2}') +skipBackup=true +waitForCompletion=false +quiet=false # # Process command line arguments. -skipBackup=true -while getopts "hbi:r:" option; do +while getopts "qhwbi:r:" option; do case $option in r) region="$OPTARG" ;; @@ -48,10 +91,21 @@ while getopts "hbi:r:" option; do ;; b) skipBackup=false ;; + w) waitForCompletion=true + ;; + q) quiet=true + ;; *) usage ;; esac done + +if which jq aws > /dev/null 2>&1; then + : +else + echo "Error, both 'jq' and 'aws' are required to run this script." 1>&2 + exit 1 +fi # # Ensure all the required parameters have been provided. if [ -z "$volumeId" ]; then @@ -68,11 +122,20 @@ if [ $? != "0" ]; then else status=$(jq -r .Lifecycle $tmpout 2> /dev/null) if [ "$status" == "DELETING" -o "$status" == "PENDING" ]; then - echo "Volume '$volumeId' is being deleted." - exit 0 + [ $quiet != "true" ] && printf "Volume '$volumeId' is being deleted." + if [ "$waitForCompletion" == "true" ]; then + waitForVolumeDelete $volumeId + if [ $? -ne 0 ]; then + exit 1 + fi + [ $quiet != "true" ] && printf "\n" + else + [ $quiet != "true" ] && printf "\n" + fi else echo "Unknown status '$status'. 
Complete output returned from the AWS api:" 1>&2 cat $tmpout 1>&2 exit 1 fi fi +exit 0 diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_subnets b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_subnets index 66be940..dc8ebe5 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_subnets +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_subnets @@ -51,8 +51,8 @@ for region in $regions; do printf "\nRegion: $region\n" fi if [ -z "$vpcId" ]; then - aws ec2 describe-subnets | jq -r '.Subnets[] | .VpcId + "," + .SubnetId + "," + .CidrBlock + "," + (first(.Tags[] | select(.Key == "Name").Value) // "")' | awk -F, 'BEGIN {formatStr="%21s %24s %18s %s\n"; printf(formatStr, "VPC Id", "Subnet ID", "CIDR", "Name")} {printf(formatStr , $1, $2, $3, $4)}' + aws ec2 describe-subnets --region=$region | jq -r '.Subnets[] | .VpcId + "," + .SubnetId + "," + .CidrBlock + "," + (if(has("Tags")) then first(.Tags[] | select(.Key == "Name").Value) // "" else "" end)' | awk -F, 'BEGIN {formatStr="%21s %24s %18s %s\n"; printf(formatStr, "VPC Id", "Subnet ID", "CIDR", "Name")} {printf(formatStr , $1, $2, $3, $4)}' else - aws ec2 describe-subnets --filters '[{"Name": "vpc-id", "Values": ["'$vpcId'"]}]' | jq -r '.Subnets[] | .VpcId + "," + .SubnetId + "," + .CidrBlock + "," + (first(.Tags[] | select(.Key == "Name").Value) // "")' | awk -F, 'BEGIN {formatStr="%21s %24s %18s %s\n"; printf(formatStr, "VPC Id", "Subnet ID", "CIDR", "Name")} {printf(formatStr , $1, $2, $3, $4)}' + aws ec2 describe-subnets --region=$region --filters '[{"Name": "vpc-id", "Values": ["'$vpcId'"]}]' | jq -r '.Subnets[] | .VpcId + "," + .SubnetId + "," + .CidrBlock + "," + (if(has("Tags")) then first(.Tags[] | select(.Key == "Name").Value) // "" else "" end)' | awk -F, 'BEGIN {formatStr="%21s %24s %18s %s\n"; printf(formatStr, "VPC Id", "Subnet ID", "CIDR", "Name")} {printf(formatStr , $1, $2, $3, $4)}' fi done diff --git 
a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_vpcs b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_vpcs index 3ecdc96..c35176a 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_vpcs +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_aws_vpcs @@ -59,7 +59,7 @@ vpcFormatStr="%21s %19s %s\n" for region in $regions; do [ "$quiet" != "True" ] && printf "\nRegion: $region\n" first=True - aws ec2 describe-vpcs --region $region | jq -r '.Vpcs[] | .VpcId + " " + .CidrBlock + " " + (if (.Tags != null) then (.Tags[] | (select(.Key == "Name") .Value)) else "" end)' | \ + aws ec2 describe-vpcs --region=$region | jq -r '.Vpcs[] | .VpcId + " " + .CidrBlock + " " + (if (has("Tags")) then .Tags[] | (select(.Key == "Name") .Value) else "" end)' | \ while read vpcId cidr name; do if [ "$quiet" != "True" -a "$first" == "True" ]; then printf "\n$vpcFormatStr" "VPC IP" "CIDR" "Name" @@ -69,7 +69,7 @@ for region in $regions; do if [ "$subnets" == "True" ]; then printf "\n\tSubnets:\n" - aws ec2 describe-subnets --filters '[{"Name": "vpc-id", "Values": ["'$vpcId'"]}]' | jq -r '.Subnets[] | .VpcId + " " + .SubnetId + " " + .CidrBlock + " " + (first(.Tags[] | select(.Key == "Name").Value) // "")' | awk 'BEGIN {formatStr="\t\t%24s %18s %s\n"; printf(formatStr, "Subnet ID", "CIDR", "Name")} {name=$4; for (i=5; i<=NF; i++) {name=name " " $(i)}; printf(formatStr , $2, $3, name)}' + aws ec2 describe-subnets --region=$region --filters '[{"Name": "vpc-id", "Values": ["'$vpcId'"]}]' | jq -r '.Subnets[] | .VpcId + " " + .SubnetId + " " + .CidrBlock + " " + (if(has("Tags")) then first(.Tags[] | select(.Key == "Name").Value) // "" else "" end)' | awk 'BEGIN {formatStr="\t\t%24s %18s %s\n"; printf(formatStr, "Subnet ID", "CIDR", "Name")} {name=$4; for (i=5; i<=NF; i++) {name=name " " $(i)}; printf(formatStr , $2, $3, name)}' first=True fi done diff --git a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_fsxn_svms 
b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_fsxn_svms index 26b7d64..a5c199f 100755 --- a/Management-Utilities/fsx-ontap-aws-cli-scripts/list_fsxn_svms +++ b/Management-Utilities/fsx-ontap-aws-cli-scripts/list_fsxn_svms @@ -101,7 +101,11 @@ else fi if [ ! -z "$fileSystemName" ]; then - fileSystemID=$(aws fsx describe-file-systems --output=json 2> /dev/null | jq -r '.FileSystems[] | if((.Tags[] | select(.Key == "Name") .Value) == "'"${fileSystemName}"'") then .FileSystemId else empty end' 2> /dev/null) + fileSystemID=$(aws fsx describe-file-systems --output=json --region=$region 2> /dev/null | jq -r '.FileSystems[] | if((.Tags[] | select(.Key == "Name") .Value) == "'"${fileSystemName}"'") then .FileSystemId else empty end' 2> /dev/null) + if [ -z "$fileSystemID" ]; then + echo "Error, failed to find a file system with the name '$fileSystemName'. Maybe in a different region?" 1>&2 + exit 1 + fi fi # # Loop on all the regions. diff --git a/Management-Utilities/fsxn-rotate-secret/terraform/main.tf b/Management-Utilities/fsxn-rotate-secret/terraform/main.tf index ca71341..82fef56 100644 --- a/Management-Utilities/fsxn-rotate-secret/terraform/main.tf +++ b/Management-Utilities/fsxn-rotate-secret/terraform/main.tf @@ -11,7 +11,12 @@ resource "random_id" "id" { byte_length = 4 } # -# Create the assume role policy document for the Lambda function. +# Create a local variable for the Lambda function name, so it can be used in two places without causing a cycle. +locals { + lambdaName = "fsxn_rotate_secret-${random_id.id.hex}" +} +# +# Create the policy document for the assume role policy for the Lambda function role. data "aws_iam_policy_document" "assume_role" { statement { effect = "Allow" @@ -25,8 +30,8 @@ data "aws_iam_policy_document" "assume_role" { } } # -# Create the inline policy document for the Lambda function role. -data "aws_iam_policy_document" "inline_permissions" { +# Create a policy document for the policy for the Lambda function role. 
+data "aws_iam_policy_document" "lambda_permissions" { # # The frist two statements are required for the lambda function to write logs to CloudWatch. # While not required, are useful for debugging. @@ -75,20 +80,18 @@ data "aws_iam_policy_document" "inline_permissions" { } } # -# Create a local variable for the Lambda function name, so it can be used in two places without causing a cycle. -locals { - lambdaName = "fsxn_rotate_secret-${random_id.id.hex}" -} -# # Create the IAM role for the Lambda function. -resource "aws_iam_role" "iam_for_lambda" { - name = "iam_for_lambda-${random_id.id.hex}" +resource "aws_iam_role" "role_for_lambda" { + name = "rotate_fsxn_secret_role_${random_id.id.hex}" description = "IAM role for the Rotate FSxN Secret Lambda function." assume_role_policy = data.aws_iam_policy_document.assume_role.json - inline_policy { - name = "required_policy" - policy = data.aws_iam_policy_document.inline_permissions.json - } +} +# +# Create the policy based on the policy document. +resource "aws_iam_role_policy" "lambda_permissions" { + name = "rotate_fsxn_secret_policy_${random_id.id.hex}" + role = aws_iam_role.role_for_lambda.name + policy = data.aws_iam_policy_document.lambda_permissions.json } # # Create the archive file for the Lambda function. @@ -103,10 +106,11 @@ resource "aws_lambda_function" "rotateLambdaFunction" { provider = aws.secrets_provider function_name = local.lambdaName description = var.svm_id != "" ? "Lambda function to rotate the secret for SVM (${var.svm_id})." : "Lambda function to rotate the secret for FSxN File System (${var.fsx_id})." 
- role = aws_iam_role.iam_for_lambda.arn + role = aws_iam_role.role_for_lambda.arn runtime = "python3.12" handler = "fsxn_rotate_secret.lambda_handler" filename = "fsxn_rotate_secret.zip" + timeout = 10 source_code_hash = data.archive_file.lambda.output_base64sha256 } # diff --git a/Management-Utilities/fsxn-rotate-secret/terraform/output.tf b/Management-Utilities/fsxn-rotate-secret/terraform/output.tf index 5c670cb..d5b215b 100644 --- a/Management-Utilities/fsxn-rotate-secret/terraform/output.tf +++ b/Management-Utilities/fsxn-rotate-secret/terraform/output.tf @@ -20,10 +20,10 @@ output "lambda_name" { output "role_arn" { description = "The ARN of the role that was created that allows the Lambda function to rotate the secret." - value = aws_iam_role.iam_for_lambda.arn + value = aws_iam_role.role_for_lambda.arn } output "role_name" { description = "The name of the role that was created that allows the Lambda function to rotate the secret." - value = aws_iam_role.iam_for_lambda.name + value = aws_iam_role.role_for_lambda.name } diff --git a/Monitoring/README.md b/Monitoring/README.md index 4796d0e..f98a482 100644 --- a/Monitoring/README.md +++ b/Monitoring/README.md @@ -3,6 +3,7 @@ This subfolder contains tools that can help you monitor your FSx ONTAP file syst | Tool | Description | | --- | --- | +| [CloudWatch Dashboard for FSx for ONTAP](/Monitoring/CloudWatch-FSx) | This tool creates a CloudWatch dashboard that displays metrics for your FSx for ONTAP file system. | | [LUN-monitoring](/Monitoring/LUN-monitoring) | This tool exports FSxN LUN metrics to CloudWatch and creates a CloudWatch dashboard to you can monitor your LUNs. | | [auto-add-cw-alarms](/Monitoring/auto-add-cw-alarms) | This tool will automatically add CloudWatch alarms that will alert you when:
  • The utilization of the primary storage of any FSx ONTAP file system gets above a specified threshold.
  • The CPU utilization of any file system gets above a specified threshold.
  • The utilization of any volume within any file system gets above a specified threshold.
| | [monitor-ontap-services](/Monitoring/monitor-ontap-services)| This tool helps you monitor various Data ONTAP services and send SNS alerts if anything of interest is detected. The following services are monitored:
  • EMS Messages
  • SnapMirror health, including lag time
  • Aggregate, volume, or quota utilization based on user-provided thresholds
  • Overall health of the File System
| diff --git a/Monitoring/auto-add-cw-alarms/README.md b/Monitoring/auto-add-cw-alarms/README.md index 798a0d6..fd22a2c 100644 --- a/Monitoring/auto-add-cw-alarms/README.md +++ b/Monitoring/auto-add-cw-alarms/README.md @@ -8,39 +8,78 @@ delete alarms. This can be tedious, and error prone. This script will automate t AWS CloudWatch alarms that monitor the utilization of the file system and its volumes. It will also create alarms to monitor the CPU utilization of the file system. And if a volume or file system is removed, it will remove the associated alarms. -To implement this, you might think to just create EventTail filters to trigger on the creation or deletion of an FSx Volume. +To implement this, you might think to just create EventBridge rules that trigger on the creation or deletion of an FSx Volume. This would kind of work, but since you have command line access to the FSx for ONTAP file system, you can create -and delete volumes without creating CloudTrail events. So, this method would not be reliable. Therefore, instead +and delete volumes without generating any CloudTrail events. So, depending on CloudTrail events would not be reliable. Therefore, instead of relying on those events, this script will scan all the file systems and volumes in all the regions then create and delete alarms as needed. ## Invocation -There are two ways you can invoke this script (Python program). Either from a computer that has Python installed, or you could upload it -as a Lambda function. +The preferred way to run this script is as a Lambda function. That is because it is very inexpensive to run without having +to maintain any compute resources. You can use an `EventBridge Schedule` to run it on a regular basis to +ensure that all the CloudWatch alarms are kept up to date. 
Since there are several steps involved in setting up a Lambda function, +a CloudFormation template is included in the repo, named `cloudlformation.yaml`, that will do the following steps for you: +- Create a role that will allow the Lambda function to: + - List AWS regions. This is so it can get a list of all the regions, so it can know which regions to scan for FSx for ONTAP file systems and volumes. + - List the FSx for ONTAP file systems. + - List the FSx volumes. + - List the CloudWatch alarms. + - List tags for the resources. This is so you can customize the thresholds for the alarms on a per instance basis. More on that below. + - Create CloudWatch alarms. + - Delete CloudWatch alarms that it has created (based on alarm names). +- Create a Lambda function with the Python program. +- Create an EventBridge schedule that will run the Lambda function on a user defined basis. +- Create a role that will allow the EventBridge schedule to trigger the Lambda function. + +To use the CloudFormation template perform the following steps: + +1. Download the `cloudformation.yaml` file from this repo. +2. Go to the `CloudFormation` services page in the AWS console and select `Create Stack -> With new resources (standard)`. +3. Select `Choose an existing template` and `Upload a template file`. +4. Click `Choose file` and select the `cloudformation.yaml` file you downloaded in step 1. +5. Click `Next` and fill in the parameters presented on the next page. The parameters are: + - `Stack name` - The name of the CloudFormation stack. Note this name is also used as a base name for some of the resources that are created, to make them unique, so you must keep this string under 25 characters, so the resource names don't exceed their name length limit. + - `SNStopic` - The SNS Topic name where CloudWatch will send alerts to. 
Note that since CloudWatch can't send messages to an SNS topic residing in a different region, it is assumed that the SNS topic, with the same name, will exist in all the regions where alarms are to be created. + - `accountId` - The AWS account ID associated with the SNS topic. This is only used to compute the ARN to the SNS Topic set above. + - `customerId` - This is optional. If provided the string entered is included in the description of every alarm created. + - `defaultCPUThreshold` - This will define a default CPU utilization threshold. You can override the default by having a specific tag associated with the file system (see below for more information). + - `defaultSSDThreshold` - This will define a default SSD (aggregate) utilization threshold. You can override the default by having a specific tag associated with the file system (see below for more information). + - `defaultVolumeThreshold` - This will define the default Volume utilization threshold. You can override the default by having a specific tag associated with the volume (see below for more information). + - `checkInterval` - This is the interval in minutes that the program will run. + - `alarmPrefixString` - This defines the string that will be prepended to every CloudWatch alarm name that the program creates. Having a known prefix is how it knows it is the one maintaining the alarm. + - `regions` - This is a comma separated list of AWS region names (e.g. us-east-1) that the program will act on. If not specified, the program will scan on all regions that support an FSx for ONTAP file system. Note that no checking is performed to ensure that the regions you provide are valid. +6. Click `Next`. There aren't any recommended changes to make to any of the proceeding pages, so just click `Next` again. +7. On the final page, check the box that says `I acknowledge that AWS CloudFormation might create IAM resources with custom names.` and then click `Submit`. 
+ +If you prefer, you can run this Python program on any UNIX based computer that has Python installed. See the "Running on a computer" section below for more information. ### Configuring the program -Before you can run the program you will need to configure it. You can configure it a few ways: +If you use the CloudFormation template to deploy the program, it will create the appropriate environment variables for you. +However, if you didn't use the CloudFormation template, you will need to configure the program yourself. Here are the +various ways you can do so: * By editing the top part of the program itself where there are the following variable definitions. -* By setting environment variables. +* By setting environment variables with the same names as the variables in the program. * If running it as a standalone program, via some command line options. +:bulb: **NOTE:** The CloudFormation template will prompt for these values when you create the stack and will set the appropriate environment variables for you. + Here is the list of variables, and what they define: | Variable | Description |Command Line Option| |:---------|:------------|:--------------------------------| -|SNStopic | The SNS Topic name where CloudWatch will send alerts to. Note that it is assumed that the SNS topic, with the same name, will exist in all the regions where alarms are to be created.|-s SNS_Topic_Name| -|accountId | The AWS account ID associated with the SNStopic. This is only used to compute the ARN to the SNS Topic.|-a Account_number| -|customerId| This is really just a comment that will be added to the alarm description.|-c Customer_String| +|SNStopic | The SNS Topic name where CloudWatch will send alerts to. Note that it is assumed that the SNS topic, with the same name, will exist in all the regions where alarms are to be created.|-s SNS\_Topic\_Name| +|accountId | The AWS account ID associated with the SNS topic. 
This is only used to compute the ARN to the SNS Topic.|-a Account\_number| +|customerId| This is just an optional string that will be added to the alarm description.|-c Customer\_String| |defaultCPUThreshold | This will define the default CPU utilization threshold. You can override the default by having a specific tag associated with the file system. See below for more information.|-C number| |defaultSSDThreshold | This will define the default SSD (aggregate) utilization threshold. You can override the default by having a specific tag associated with the file system. See below for more information.|-S number| |defaultVolumeThreshold | This will define the default Volume utilization threshold. You can override the default by having a specific tag associated with the volume. See below for more information.|-V number| -|alarmPrefixCPU | This defines the string that will be put in front of the name of every CPU utilization CloudWatch alarm that the program creates. Having a known prefix is how it knows it is the one maintaining the alarm.|N/A| -|alarmPrefixSSD | This defines the string that will be put in front of the name of every SSD utilization CloudWatch alarm that the program creates. Having a known prefix is how it knows it is the one maintaining the alarm.|N/A| +|alarmPrefixCPU | This defines the string that will be put in front of the name of every CPU utilization CloudWatch alarm that the program creates. Having a known prefix is how it knows it is the one maintaining the alarm.|N/A| +|alarmPrefixSSD | This defines the string that will be put in front of the name of every SSD utilization CloudWatch alarm that the program creates. Having a known prefix is how it knows it is the one maintaining the alarm.|N/A| |alarmPrefixVolume | This defines the string that will be put in front of the name of every volume utilization CloudWatch alarm that the program creates. 
Having a known prefix is how it knows it is the one maintaining the alarm.|N/A| +|regions | This is a comma separated list of AWS region names (e.g. us-east-1) that the program will act on. If not specified, the program will scan on all regions that support an FSx for ONTAP file system. Note that no checking is performed to ensure that the regions you provide are valid.|-r region -r region ...| -There are a few command line options that don't have a corresponding variables: +There are a few command line options that don't have a corresponding variable: |Option|Description| |:-----|:----------| -|-r region| This option can be specified multiple times to limit the regions that the program will act on. If not specified, the program will act on all regions.| |-d| This option will cause the program to run in "Dry Run" mode. In this mode, the program will only display messages showing what it would have done, and not really create or delete any CloudWatch alarms.| |-F filesystem\_ID| This option will cause the program to only add or remove alarms that are associated with the filesystem\_ID.| @@ -68,8 +107,9 @@ Once you have Python and boto3 installed, you can run the program by executing t python3 auto_add_cw_alarms.py ``` This will run the program based on all the variables set at the top. If you want to change the behavior without -having to edit the program, you can use the Command Line Option specified in the table above. Note that you can give a `-h` (or `--help`) -option and the program will display a list of all the available options. +having to edit the program, you can either use the Command Line Option specified in the table above or you can +set the appropriate environment variable. Note that you can give a `-h` (or `--help`) command line option +and the program will display a list of all the available options. You can limit the regions that the program will act on by using the `-r region` option. 
You can specify that option multiple times to act on multiple regions. @@ -78,21 +118,39 @@ You can run the program in "Dry Run" mode by specifying the `-d` (or `--dryRun`) messages showing what it would have done, and not really create or delete any CloudWatch alarms. ### Running as a Lambda function -If you run the program as a Lambda function, you will want to set the timeout to at least two minutes since some of the API calls -can take a significant amount of "clock time" to run, especially in distant regions. - -Once you have installed the Lambda function it is recommended to set up a scheduled type EventBridge rule so the function will run on a regular basis. - -The appropriate permissions will need to be assigned to the Lambda function in order for it to run correctly. -It doesn't need many permissions. It just needs to be able to: -* List the FSx for ONTAP file systems -* List the FSx volume names -* List the CloudWatch alarms -* Create CloudWatch alarms -* Delete CloudWatch alarms -* Create CloudWatch Log Groups and Log Streams in case you need to diagnose an issue - -The following permissions are required to run the script (although you could narrow the "Resource" specification to suit your needs.) +A CloudFormation template is included in the repo that will do the steps below. If you don't want to use that, here are +the detailed steps required to install the program as a Lambda function. + +#### Create a Lambda function +1. Download the `auto_add_cw_alarms.py` file from this repo. +2. Create a new Lambda function in the AWS console by going to the Lambda services page and clicking on the `Create function` button. +3. Choose `Author from scratch` and give the function a name. For example `auto_add_cw_alarms`. +4. Choose the latest version of Python (currently Python 3.11) as the runtime and click on `Create function`. +5. In the function code section, copy and paste the contents of the `auto_add_cw_alarms.py` file into the code editor. +6. 
Click on the `Deploy` button to save the function. +7. Click on the Configuration tag and then the "General configuration" sub tab and set the "Timeout" to be at least 3 minutes. +8. Click on the "Environment variables" tab and add the following environment variables: + - `SNStopic` - The SNS Topic name where CloudWatch will send alerts to. + - `accountId` - The AWS account ID associated with the SNS topic. + - `customerId` - This is optional. If provided the string entered is included in the description of every alarm created. + - `defaultCPUThreshold` - This will define a default CPU utilization threshold. + - `defaultSSDThreshold` - This will define a default SSD (aggregate) utilization threshold. + - `defaultVolumeThreshold` - This will define the default Volume utilization threshold. + - `alarmPrefixString` - This defines the string that will be prepended to every CloudWatch alarm name that the program creates. + - `regions` - This is an optional comma separated list of AWS region names (e.g. us-east-1) that the program will act on. If not specified, the program will scan on all regions that support an FSx for ONTAP file system. + +You will also need to set up the appropriate permissions for the Lambda function to run. It doesn't need many permissions. It just needs to be able to: +* List the FSx for ONTAP file systems. +* List the FSx volume names. +* List tags associated with an FSx file system or volume. +* List the CloudWatch alarms. +* List all the AWS regions. +* Create CloudWatch alarms. +* Delete CloudWatch alarms. You can set resource to `arn:aws:cloudwatch:*:`*AccountId*`:alarm:`*alarmPrefixString*`*` to limit the deletion to only the alarms that it creates. +* Create CloudWatch Log Groups and Log Streams in case you need to diagnose an issue. + +The following is an example AWS policy that has all the required permissions to run the script (although you could narrow the "Resource" specification to suit your needs.) 
+Note it assumes that the alarmPrefixString is set to "FSx-ONTAP-Auto". ```JSON { "Version": "2012-10-17", @@ -101,20 +159,27 @@ The following permissions are required to run the script (although you could nar "Sid": "VisualEditor0", "Effect": "Allow", "Action": [ - "cloudwatch:PutMetricAlarm", - "fsx:ListTagsForResource", - "fsx:DescribeVolumes", "fsx:DescribeFilesystems", - "cloudwatch:DeleteAlarms", + "fsx:DescribeVolumes", + "fsx:ListTagsForResource", + "cloudwatch:DescribeAlarms" "cloudwatch:DescribeAlarmsForMetric", "ec2:DescribeRegions", - "cloudwatch:DescribeAlarms" + "cloudwatch:PutMetricAlarm", ], "Resource": "*" }, { "Sid": "VisualEditor1", "Effect": "Allow", + "Action": [ + "cloudwatch:DeleteAlarms" + ], + "Resource": "arn:aws:cloudwatch:*:*:alarm:FSx-ONTAP-Auto*" + }, + { + "Sid": "VisualEditor2", + "Effect": "Allow", "Action": [ "logs:CreateLogStream", "logs:PutLogEvents" @@ -122,7 +187,7 @@ The following permissions are required to run the script (although you could nar "Resource": "arn:aws:logs:*:*:log-group:*:log-stream:*" }, { - "Sid": "VisualEditor2", + "Sid": "VisualEditor3", "Effect": "Allow", "Action": "logs:CreateLogGroup", "Resource": "arn:aws:logs:*:*:log-group:*" @@ -131,15 +196,39 @@ The following permissions are required to run the script (although you could nar } ``` +Once you have deployed the Lambda function it is recommended to set up a schedule to run it on a regular basis. +The easiest way to do that is: +1. Click on the `Add trigger` button from the Lambda function page. +2. Select `EventBridge (CloudWatch Events)` as the trigger type. +3. Click on the `Create a new rule` button. +4. Give the rule a name and a description. +5. Set the `Schedule expression` to be the interval you want the function to run. For example, if you want it to run every 15 minutes, you would set the expression to `rate(15 minutes)`. +6. 
Click on the `Add` button + ### Expected Action Once the script has been configured and invoked, it will: -* Scan for every FSx for ONTAP file systems in every region. For every file system it finds it will: +* Scan for every FSx for ONTAP file systems in every region, unless you have specified a specific list of regions to scan. For every file system that it finds it will: * Create a CPU utilization CloudWatch alarm, unless the threshold value is set to 100 for the specific alarm. - * Create a SSD utilization CloudWatch alarm, unless the threshold value is set to 100 for the specific alarm. -* Scan for every FSx for ONTAP volume in every region. For every volume it finds it will: + * Create an SSD utilization CloudWatch alarm, unless the threshold value is set to 100 for the specific alarm. +* Scan for every FSx for ONTAP volume in every region, unless you have specified a specific list of regions to scan. For every volume it finds it will: * Create a Volume Utilization CloudWatch alarm, unless the threshold value is set to 100 for the specific alarm. * Scan for the CloudWatch alarms and remove any alarms that the associated resource doesn't exist anymore. +### Cleaning up +If you decide you don't want to use this program anymore, you can delete the CloudFormation stack that you created. +This will remove the Lambda function, the EventBridge schedule, and the roles that were created for you. If you did +not use the CloudFormation template, you will have to do these steps yourself. 
+ +Once you have removed the program, you can remove all the CloudWatch alarms that were created by the program by running +the following command: + +```bash +region=us-west-2 +aws cloudwatch describe-alarms --region=$region --alarm-name-prefix "FSx-ONTAP-Auto" --query "MetricAlarms[*].AlarmName" --output text | xargs -n 50 aws cloudwatch delete-alarms --region $region --alarm-names +``` +This command will remove all the alarms that have an alarm name that starts with "FSx-ONTAP-Auto" in the us-west-2 region. +Make sure to adjust the alarm-name-prefix to match the AlarmPrefix you set when you deployed the program. +You will also need to adjust the region variable and run the `aws` command again for each region where you have alarms in. ## Author Information diff --git a/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py b/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py index 5886be8..fb9697f 100755 --- a/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py +++ b/Monitoring/auto-add-cw-alarms/auto_add_cw_alarms.py @@ -4,15 +4,18 @@ # ONTAP volumes, that don't already have one, that will trigger when the # utilization of the volume gets above the threshold defined below. It will # also create an alarm that will trigger when the file system reach -# an average CPU utilization greater than what is specified below. +# an average CPU utilization greater than what is specified below as well +# an alarm that will trigger when the SSD utilization is greater than what +# is specified below. # # It can either be run as a standalone script, or uploaded as a Lambda # function with the thought being that you will create a EventBridge schedule # to invoke it periodically. # -# It will scan all regions looking for FSxN volumes, and since CloudWatch -# can't send SNS messages across regions, it assumes that the specified -# SNS topic exist in each region for the specified account ID. 
+# It will scan all regions looking for FSxN volumes and file systems +# and since CloudWatch can't send SNS messages across regions, it assumes +# that the specified SNS topic exist in each region for the specified +# account ID. # # Finally, a default volume threshold is defined below. It sets the volume # utilization threshold that will cause CloudWatch to send the alarm event @@ -24,6 +27,9 @@ # Lastly, you can create an override for the SSD alarm, by creating a tag # with the name "SSD_Alarm_Threshold" on the file system resource. # +# Version: %%VERSION%% +# Date: %%DATE%% +# ################################################################################ # # The following variables effect the behavior of the script. They can be @@ -64,14 +70,23 @@ # what you are doing. ################################################################################ # +# The following is put in front of all alarms so an IAM policy can be create +# that will allow this script to only be able to delete the alarms it creates. +# If you change this, you must also change the IAM policy. It can be +# set via an environment variable, this is so that the CloudFormation template +# can pass the value to the Lambda function. To change the value, change +# the "FSx-ONTAP-Auto" string to your desired value. +import os +basePrefix = os.environ.get('basePrefix', "FSx-ONTAP-Auto") +# # Define the prefix for the volume utilization alarm name for the CloudWatch alarms. -alarmPrefixVolume="Volume_Utilization_for_volume_" +alarmPrefixVolume=f"{basePrefix}-Volume_Utilization_for_volume_" # # Define the prefix for the CPU utilization alarm name for the CloudWatch alarms. -alarmPrefixCPU="CPU_Utilization_for_fs_" +alarmPrefixCPU=f"{basePrefix}-CPU_Utilization_for_fs_" # # Define the prefix for the SSD utilization alarm name for the CloudWatch alarms. 
-alarmPrefixSSD="SSD_Utilization_for_fs_" +alarmPrefixSSD=f"{basePrefix}-SSD_Utilization_for_fs_" ################################################################################ # You shouldn't have to modify anything below here. @@ -80,7 +95,6 @@ import botocore from botocore.config import Config import boto3 -import os import getopt import sys import time @@ -531,7 +545,7 @@ def lambda_handler(event, context): # This function is used to print out the usage of the script. ################################################################################ def usage(): - print('Usage: add_cw_alarm [-h|--help] [-d|--dryRun] [[-c|--customerID customerID] [[-a|--accountID aws_account_id] [[-s|--SNSTopic SNS_Topic_Name] [[-r|--region region] [[-C|--CPUThreshold threshold] [[-S|--SSDThreshold threshold] [[-V|--VolumeThreshold threshold] [-F|--FileSystemID FileSystemID]') + print('Usage: auto_add_cw_alarms [-h|--help] [-d|--dryRun] [[-c|--customerID customerID] [[-a|--accountID aws_account_id] [[-s|--SNSTopic SNS_Topic_Name] [[-r|--region region] [[-C|--CPUThreshold threshold] [[-S|--SSDThreshold threshold] [[-V|--VolumeThreshold threshold] [-F|--FileSystemID FileSystemID]') ################################################################################ # Main logic starts here. @@ -549,6 +563,9 @@ def usage(): defaultCPUThreshold = int(os.environ.get('defaultCPUThreshold', defaultCPUThreshold)) defaultSSDThreshold = int(os.environ.get('defaultSSDThreshold', defaultSSDThreshold)) defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold)) +regionsEnv = os.environ.get('regions', '') +if regionsEnv != '': + regions = regionsEnv.split(',') # # Check to see if we are bring run from a command line or a Lmabda function. 
if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None: diff --git a/Monitoring/auto-add-cw-alarms/cloudformation.yaml b/Monitoring/auto-add-cw-alarms/cloudformation.yaml new file mode 100644 index 0000000..686ce57 --- /dev/null +++ b/Monitoring/auto-add-cw-alarms/cloudformation.yaml @@ -0,0 +1,772 @@ +Description: "Deploy auto-add-cw-alarms." +# +# This just formats the page that prompts for the parameters when using the AWS Console to deploy your stack. +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "Configuration Parameters" + Parameters: + - SNStopic + - accountId + - customerId + - defaultCPUThreshold + - defaultSSDThreshold + - defaultVolumeThreshold + - checkInterval + - alarmPrefixString + - regions + +Parameters: + SNStopic: + Description: "The SNS Topic name where CloudWatch will send alerts to. Note that it is assumed that the SNS topic, with the same name, will exist in all the regions where alarms are to be created." + Type: String + + accountId: + Description: "The AWS account ID associated with the SNStopic. This is only used to compute the ARN to the SNS Topic." + Type: String + + customerId: + Description: "This is really just a comment that will be added to the alarm description." + Type: String + Default: "" + + defaultCPUThreshold: + Description: "This will define the default CPU utilization threshold. You can override the default by having a specific tag associated with the file system. See below for more information." + Type: Number + MinValue: 0 + MaxValue: 100 + Default: 80 + + defaultSSDThreshold: + Description: "This will define the default SSD (aggregate) utilization threshold. You can override the default by having a specific tag associated with the file system. See below for more information." + Type: Number + MinValue: 0 + MaxValue: 100 + Default: 80 + + defaultVolumeThreshold: + Description: "This will define the default Volume utilization threshold. 
You can override the default by having a specific tag associated with the volume. See below for more information." + Type: Number + MinValue: 0 + MaxValue: 100 + Default: 80 + + checkInterval: + Description: "This is how often you want the Lambda function to run to look for new file systems and/or volumes (minutes)." + Type: Number + MinValue: 5 + Default: 15 + + alarmPrefixString: + Description: "This is the string that will be prepended to all CloudWatch alarms created by this script." + Type: String + Default: "FSx-ONTAP-Auto" + + regions: + Description: "This is a list of AWS regions that you want the Lambda function to run in. If left blank, it will run in all regions." + Type: CommaDelimitedList + Default: "" + +Resources: + LambdaRole: + Type: "AWS::IAM::Role" + Properties: + RoleName: !Sub "Lambda-Role-for-${AWS::StackName}" + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: "Allow" + Principal: + Service: "lambda.amazonaws.com" + Action: "sts:AssumeRole" + + ManagedPolicyArns: + - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + + Policies: + - PolicyName: "LambdaPolicy_for_Auto_Add_CW_Alarms" + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: "Allow" + Action: + - "fsx:DescribeFileSystems" + - "fsx:DescribeVolumes" + - "fsx:ListTagsForResource" + - "ec2:DescribeRegions" + - "cloudwatch:DescribeAlarms" + - "cloudwatch:DescribeAlarmsForMetric" + - "cloudwatch:PutMetricAlarm" + Resource: "*" + + - Effect: "Allow" + Action: + - "cloudwatch:DeleteAlarms" + Resource: !Sub "arn:aws:cloudwatch:*:${AWS::AccountId}:alarm:${alarmPrefixString}*" + + SchedulerRole: + Type: "AWS::IAM::Role" + Properties: + RoleName: !Sub "SchedulerRole-for-${AWS::StackName}" + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: "Allow" + Principal: + Service: "scheduler.amazonaws.com" + Action: "sts:AssumeRole" + + Policies: + - PolicyName: "SchedulerPolicy_for_Auto_Add_CW_Alarms" + PolicyDocument: + 
Version: "2012-10-17" + Statement: + - Effect: "Allow" + Action: + - "lambda:InvokeFunction" + Resource: !GetAtt LambdaFunction.Arn + + LambdaScheduler: + Type: "AWS::Scheduler::Schedule" + Properties: + Description: "Schedule the auto_add_cw_alarms Lambda function." + Name: !Sub "Schedule-for-${AWS::StackName}" + FlexibleTimeWindow: + Mode: "OFF" + ScheduleExpression: !Sub "rate(${checkInterval} minutes)" + Target: + Arn: !GetAtt LambdaFunction.Arn + RoleArn: !GetAtt SchedulerRole.Arn + + LambdaFunction: + Type: "AWS::Lambda::Function" + Properties: + FunctionName: !Sub "Lambda-for-${AWS::StackName}" + Role: !GetAtt LambdaRole.Arn + PackageType: "Zip" + Runtime: "python3.12" + Handler: "index.lambda_handler" + Timeout: 300 + Environment: + Variables: + SNStopic: !Ref SNStopic + accountId: !Ref accountId + customerId: !Ref customerId + defaultCPUThreshold: !Ref defaultCPUThreshold + defaultSSDThreshold: !Ref defaultSSDThreshold + defaultVolumeThreshold: !Ref defaultVolumeThreshold + basePrefix: !Ref alarmPrefixString + regions: !Join [",", !Ref regions] + + Code: + ZipFile: | + #!/usr/bin/python3 + # + # This script is used to add CloudWatch alarms for all the FSx for NetApp + # ONTAP volumes, that don't already have one, that will trigger when the + # utilization of the volume gets above the threshold defined below. It will + # also create an alarm that will trigger when the file system reach + # an average CPU utilization greater than what is specified below as well + # an alarm that will trigger when the SSD utilization is greater than what + # is specified below. + # + # It can either be run as a standalone script, or uploaded as a Lambda + # function with the thought being that you will create a EventBridge schedule + # to invoke it periodically. 
+ # + # It will scan all regions looking for FSxN volumes and file systems + # and since CloudWatch can't send SNS messages across regions, it assumes + # that the specified SNS topic exists in each region for the specified + # account ID. + # + # Finally, a default volume threshold is defined below. It sets the volume + # utilization threshold that will cause CloudWatch to send the alarm event + # to the SNS topic. It can be overridden on a per volume basis by having a + # tag with the name of "alarm_threshold" set to the desired threshold. + # If the tag is set to 100, then no alarm will be created. You can also + # set an override to the filesystem CPU utilization alarm, by setting + # a tag with the name of 'CPU_Alarm_Threshold' on the file system resource. + # Lastly, you can create an override for the SSD alarm, by creating a tag + # with the name "SSD_Alarm_Threshold" on the file system resource. + # + # Version: v2.13 + # Date: 2024-10-03-12:30:21 + # + ################################################################################ + # + # The following variables effect the behavior of the script. They can be + # either be set here, overridden via the command line options, or + # overridden by environment variables. + # + # Define which SNS topic you want "volume full" message to be sent to. + SNStopic='' + # + # Provide the account id the SNS topic resides under: + # MUST be a string. + accountId='' + # + # Set the customer ID associated with the AWS account. This is used + # as part of the alarm name prefix so a customer ID can be associated + # with the alarm. If it is left as an empty string, no extra prefix + # will be added. + customerId='' + # + # Define the default CPU utilization threshold before sending the alarm. + # Setting it to 100 will disable the creation of the alarm. + defaultCPUThreshold=80 + # + # Define the default SSD utilization threshold before sending the alarm. + # Setting it to 100 will disable the creation of the alarm. 
+ defaultSSDThreshold=90 + # + # Define the default volume utilization threshold before sending the alarm. + # Setting it to 100 will disable the creation of the alarm. + defaultVolumeThreshold=80 + # + # + ################################################################################ + # You can't change the following variables from the command line or environment + # variables since changing them after the program has run once would cause + # all existing CloudWatch alarms to be abandoned, and all new alarms to be + # created. So, it is not recommended to change these variables unless you know + # what you are doing. + ################################################################################ + # + # The following is put in front of all alarms so an IAM policy can be created + # that will allow this script to only be able to delete the alarms it creates. + # If you change this, you must also change the IAM policy. It can be + # set via an environment variable, this is so that the CloudFormation template + # can pass the value to the Lambda function. To change the value, change + # the "FSx-ONTAP-Auto" string to your desired value. + import os + basePrefix = os.environ.get('basePrefix', "FSx-ONTAP-Auto") + # + # Define the prefix for the volume utilization alarm name for the CloudWatch alarms. + alarmPrefixVolume=f"{basePrefix}-Volume_Utilization_for_volume_" + # + # Define the prefix for the CPU utilization alarm name for the CloudWatch alarms. + alarmPrefixCPU=f"{basePrefix}-CPU_Utilization_for_fs_" + # + # Define the prefix for the SSD utilization alarm name for the CloudWatch alarms. + alarmPrefixSSD=f"{basePrefix}-SSD_Utilization_for_fs_" + + ################################################################################ + # You shouldn't have to modify anything below here. 
+ ################################################################################ + + import botocore + from botocore.config import Config + import boto3 + import getopt + import sys + import time + import json + + ################################################################################ + # This function adds the SSD Utilization CloudWatch alarm. + ################################################################################ + def add_ssd_alarm(cw, fsId, alarmName, alarmDescription, threshold, region): + action = 'arn:aws:sns:' + region + ':' + accountId + ':' + SNStopic + if not dryRun: + cw.put_metric_alarm( + AlarmName=alarmName, + ActionsEnabled=True, + AlarmActions=[action], + AlarmDescription=alarmDescription, + EvaluationPeriods=1, + DatapointsToAlarm=1, + Threshold=threshold, + ComparisonOperator='GreaterThanThreshold', + MetricName="StorageCapacityUtilization", + Period=300, + Statistic="Average", + Namespace="AWS/FSx", + Dimensions=[{'Name': 'FileSystemId', 'Value': fsId}, {'Name': 'StorageTier', 'Value': 'SSD'}, {'Name': 'DataType', 'Value': 'All'}] + ) + else: + print(f'Would have added SSD alarm for {fsId} with name {alarmName} with threshold of {threshold} in {region} with action {action}') + + ################################################################################ + # This function adds the CPU Utilization CloudWatch alarm. 
+ ################################################################################ + def add_cpu_alarm(cw, fsId, alarmName, alarmDescription, threshold, region): + action = 'arn:aws:sns:' + region + ':' + accountId + ':' + SNStopic + if not dryRun: + cw.put_metric_alarm( + AlarmName=alarmName, + ActionsEnabled=True, + AlarmActions=[action], + AlarmDescription=alarmDescription, + EvaluationPeriods=1, + DatapointsToAlarm=1, + Threshold=threshold, + ComparisonOperator='GreaterThanThreshold', + MetricName="CPUUtilization", + Period=300, + Statistic="Average", + Namespace="AWS/FSx", + Dimensions=[{'Name': 'FileSystemId', 'Value': fsId}] + ) + else: + print(f'Would have added CPU alarm for {fsId} with name {alarmName} with threshold of {threshold} in {region} with action {action}.') + + ################################################################################ + # This function adds the Volume utilization CloudWatch alarm. + ################################################################################ + def add_volume_alarm(cw, volumeId, alarmName, alarmDescription, fsId, threshold, region): + action = 'arn:aws:sns:' + region + ':' + accountId + ':' + SNStopic + if not dryRun: + cw.put_metric_alarm( + ActionsEnabled=True, + AlarmName=alarmName, + AlarmActions=[action], + AlarmDescription=alarmDescription, + EvaluationPeriods=1, + DatapointsToAlarm=1, + Threshold=threshold, + ComparisonOperator='GreaterThanThreshold', + Metrics=[{"Id":"e1","Label":"Utilization","ReturnData":True,"Expression":"m2/m1*100"},\ + {"Id":"m2","ReturnData":False,"MetricStat":{"Metric":{"Namespace":"AWS/FSx","MetricName":"StorageUsed","Dimensions":[{"Name":"VolumeId","Value": volumeId},{"Name":"FileSystemId","Value":fsId}]},"Period":300,"Stat":"Average"}},\ + {"Id":"m1","ReturnData":False,"MetricStat":{"Metric":{"Namespace":"AWS/FSx","MetricName":"StorageCapacity","Dimensions":[{"Name":"VolumeId","Value": volumeId},{"Name":"FileSystemId","Value":fsId}]},"Period":300,"Stat":"Average"}}] + ) 
+ else: + print(f'Would have added volume alarm for {volumeId} {fsId} with name {alarmName} with threshold of {threshold} in {region} with action {action}.') + + + ################################################################################ + # This function deletes a CloudWatch alarm. + ################################################################################ + def delete_alarm(cw, alarmName): + if not dryRun: + cw.delete_alarms(AlarmNames=[alarmName]) + else: + print(f'Would have deleted alarm {alarmName}.') + + ################################################################################ + # This function checks to see if the alarm already exists. + ################################################################################ + def contains_alarm(alarmName, alarms): + for alarm in alarms: + if(alarm['AlarmName'] == alarmName): + return True + return False + + ################################################################################ + # This function checks to see if a volume exists. + ################################################################################ + def contains_volume(volumeId, volumes): + for volume in volumes: + if(volume['VolumeId'] == volumeId): + return True + return False + + ################################################################################ + # This function checks to see if a file system exists. + ################################################################################ + def contains_fs(fsId, fss): + for fs in fss: + if(fs['FileSystemId'] == fsId): + return True + return False + + ################################################################################ + # This function returns the value assigned to the "alarm_threshold" tag + # associated with the arn passed in. If none is found, it returns the default + # threshold set above. 
+ ################################################################################ + def getAlarmThresholdTagValue(fsx, arn): + # + # If there are a lot of volumes, we could get hit by the AWS rate limit, + # so we will sleep for a short period of time and then retry. We will + # double the sleep time each time we get a rate limit exception until + # we get to 5 seconds, then we will just raise the exception. + sleep=.125 + # + # This is put into a try block because it is possible that the volume + # is deleted between the time we get the list of volumes and the time + # we try to get the tags for the volume. + while True: + try: + tags = fsx.list_tags_for_resource(ResourceARN=arn) + for tag in tags['Tags']: + if(tag['Key'].lower() == "alarm_threshold"): + return(tag['Value']) + return(defaultVolumeThreshold) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'ResourceNotFound': + return(100) # Return 100 so we don't try to create an alarm. + + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting tags. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + ################################################################################ + # This function returns the value assigned to the "CPU_alarm_threshold" tag + # that is in the array of tags passed in. if it doesn't find that tag it + # returns the default threshold set above. 
+ ################################################################################ + def getCPUAlarmThresholdTagValue(tags): + for tag in tags: + if(tag['Key'].lower() == "cpu_alarm_threshold"): + return(tag['Value']) + return(defaultCPUThreshold) + + ################################################################################ + # This function returns the value assigned to the "SSD_alarm_threshold" tag + # that is in the array of tags passed in. if it doesn't find that tag it + # returns the default threshold set above. + ################################################################################ + def getSSDAlarmThresholdTagValue(tags): + for tag in tags: + if(tag['Key'].lower() == "ssd_alarm_threshold"): + return(tag['Value']) + return(defaultSSDThreshold) + + ################################################################################ + # This function returns the file system id that the passed in alarm is + # associated with. + ################################################################################ + def getFileSystemId(alarm): + + for metric in alarm['Metrics']: + if metric["Id"] == "m1": + for dim in metric['MetricStat']['Metric']['Dimensions']: + if dim['Name'] == 'FileSystemId': + return dim['Value'] + return None + + ################################################################################ + # This function will return all the file systems in the region. It will handle the + # case where there are more file systems than can be returned in a single call. + # It will also handle the case where we get a rate limit exception. + ################################################################################ + def getFss(fsx): + + # The initial amount of time to sleep if there is a rate limit exception. 
+ sleep=.125 + while True: + try: + response = fsx.describe_file_systems() + fss = response['FileSystems'] + nextToken = response.get('NextToken') + sleep=.125 + break + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 # Exponential backoff. + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting initial file system list. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + while nextToken: + try: + response = fsx.describe_file_systems(NextToken=nextToken) + fss += response['FileSystems'] + nextToken = response.get('NextToken') + sleep=.125 + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 # Exponential backoff. + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting additional file systems. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + return fss + + ################################################################################ + # This function will return all the volumes in the region. It will handle the + # case where there are more volumes than can be returned in a single call. + # It will also handle the case where we get a rate limit exception. + ################################################################################ + def getVolumes(fsx): + # + # The initial amount of time to sleep if there is a rate limit exception. 
+ sleep=.125 + while True: + try: + response = fsx.describe_volumes() + volumes = response['Volumes'] + nextToken = response.get('NextToken') + sleep=.125 + break + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 # Exponential backoff. + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting the initial list of volumes. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + while nextToken: + try: + response = fsx.describe_volumes(NextToken=nextToken) + volumes += response['Volumes'] + nextToken = response.get('NextToken') + sleep=.125 + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 # Exponential backoff. + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting additional volumes. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + return volumes + + ################################################################################ + # This function will return all the alarms in the region. It will handle the + # case where there are more alarms than can be returned in a single call. + # It will also handle the case where we get a rate limit exception. + ################################################################################ + def getAlarms(cw): + + # The initial amount of time to sleep if there is a rate limit exception. 
+ sleep=.125 + while True: + try: + response = cw.describe_alarms() + alarms = response['MetricAlarms'] + nextToken = response.get('NextToken') + sleep=.125 + break + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting the initial list of alarms. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + while nextToken: + try: + response = cw.describe_alarms(NextToken=nextToken) + alarms += response['MetricAlarms'] + nextToken = response.get('NextToken') + sleep=.125 + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'TooManyRequestsException' or e.response['Error']['Code'] == 'ThrottlingException': + sleep = sleep * 2 # Exponential backoff. + if sleep > 5: + raise e + print(f"Warning: Rate Limit fault while getting additional alarms. Sleeping for {sleep} seconds.") + time.sleep(sleep) + else: + print(f"boto3 client error: {json.dumps(e.response)}") + raise e + + return alarms + + ################################################################################ + # This is the main logic of the program. It loops on all the regions then all + # the fsx volumes within the region, checking to see if any of them already + # have a CloudWatch alarm, and if not, add one. + ################################################################################ + def lambda_handler(event, context): + global customerId, regions, SNStopic, accountId, onlyFilesystemId + # + # If the customer ID is set, reformat it to be used in the alarm description. 
+ if customerId != '': + customerId = f", CustomerID: {customerId}" + + if len(SNStopic) == 0: + raise Exception("You must specify a SNS topic to send the alarm messages to.") + + if len(accountId) == 0: + raise Exception("You must specify an accountId to run this program.") + # + # Configure boto3 to use the more advanced "adaptive" retry method. + boto3Config = Config( + retries = { + 'max_attempts': 5, + 'mode': 'adaptive' + } + ) + + if len(regions) == 0: # pylint: disable=E0601 + ec2Client = boto3.client('ec2', config=boto3Config) + ec2Regions = ec2Client.describe_regions()['Regions'] + for region in ec2Regions: + regions += [region['RegionName']] + + fsxRegions = boto3.Session().get_available_regions('fsx') + for region in regions: + if region in fsxRegions: + print(f'Scanning {region}') + fsx = boto3.client('fsx', region_name=region, config=boto3Config) + cw = boto3.client('cloudwatch', region_name=region, config=boto3Config) + # + # Get all the file systems, volumes and alarm in the region. + fss = getFss(fsx) + volumes = getVolumes(fsx) + alarms = getAlarms(cw) + # + # Scan for filesystems without CPU Utilization Alarm. + for fs in fss: + if(fs['FileSystemType'] == "ONTAP"): + threshold = int(getCPUAlarmThresholdTagValue(fs['Tags'])) + if(threshold != 100): + fsId = fs['FileSystemId'] + fsName = fsId.replace('fs-', 'FsxId') + alarmName = alarmPrefixCPU + fsId + alarmDescription = f"CPU utilization alarm for file system {fsName}{customerId} in region {region}." + + if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or + not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId): + print(f'Adding CPU Alarm for {fs["FileSystemId"]}') + add_cpu_alarm(cw, fsId, alarmName, alarmDescription, threshold, region) + # + # Scan for CPU alarms without a FSxN filesystem. 
+ for alarm in alarms: + alarmName = alarm['AlarmName'] + if(alarmName[:len(alarmPrefixCPU)] == alarmPrefixCPU): + fsId = alarmName[len(alarmPrefixCPU):] + if(not contains_fs(fsId, fss) and onlyFilesystemId == None or + not contains_fs(fsId, fss) and onlyFilesystemId != None and onlyFilesystemId == fsId): + print("Deleting alarm: " + alarmName + " in region " + region) + delete_alarm(cw, alarmName) + # + # Scan for filesystems without SSD Utilization Alarm. + for fs in fss: + if(fs['FileSystemType'] == "ONTAP"): + threshold = int(getSSDAlarmThresholdTagValue(fs['Tags'])) + if(threshold != 100): + fsId = fs['FileSystemId'] + fsName = fsId.replace('fs-', 'FsxId') + alarmName = alarmPrefixSSD + fsId + alarmDescription = f"SSD utilization alarm for file system {fsName}{customerId} in region {region}." + + if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or + not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId): + print(f'Adding SSD Alarm for {fsId}') + add_ssd_alarm(cw, fs['FileSystemId'], alarmName, alarmDescription, threshold, region) + # + # Scan for SSD alarms without a FSxN filesystem. + for alarm in alarms: + alarmName = alarm['AlarmName'] + if(alarmName[:len(alarmPrefixSSD)] == alarmPrefixSSD): + fsId = alarmName[len(alarmPrefixSSD):] + if(not contains_fs(fsId, fss) and onlyFilesystemId == None or + not contains_fs(fsId, fss) and onlyFilesystemId != None and onlyFilesystemId == fsId): + print("Deleting alarm: " + alarmName + " in region " + region) + delete_alarm(cw, alarmName) + # + # Scan for volumes without alarms. + for volume in volumes: + if(volume['VolumeType'] == "ONTAP"): + volumeId = volume['VolumeId'] + volumeName = volume['Name'] + volumeARN = volume['ResourceARN'] + fsId = volume['FileSystemId'] + + threshold = int(getAlarmThresholdTagValue(fsx, volumeARN)) + + if(threshold != 100): # No alarm if the value is set to 100. 
+ alarmName = alarmPrefixVolume + volumeId + fsName = fsId.replace('fs-', 'FsxId') + alarmDescription = f"Volume utilization alarm for volumeId {volumeId}{customerId}, File System Name: {fsName}, Volume Name: {volumeName} in region {region}." + if(not contains_alarm(alarmName, alarms) and onlyFilesystemId == None or + not contains_alarm(alarmName, alarms) and onlyFilesystemId != None and onlyFilesystemId == fsId): + print(f'Adding volume utilization alarm for {volumeName} in region {region}.') + add_volume_alarm(cw, volumeId, alarmName, alarmDescription, fsId, threshold, region) + # + # Scan for volume alarms without volumes. + for alarm in alarms: + alarmName = alarm['AlarmName'] + if(alarmName[:len(alarmPrefixVolume)] == alarmPrefixVolume): + volumeId = alarmName[len(alarmPrefixVolume):] + if(not contains_volume(volumeId, volumes) and onlyFilesystemId == None or + not contains_volume(volumeId, volumes) and onlyFilesystemId != None and onlyFilesystemId == getFileSystemId(alarm)): + print("Deleting alarm: " + alarmName + " in region " + region) + delete_alarm(cw, alarmName) + + return + + ################################################################################ + # This function is used to print out the usage of the script. + ################################################################################ + def usage(): + print('Usage: auto_add_cw_alarms [-h|--help] [-d|--dryRun] [[-c|--customerID customerID] [[-a|--accountID aws_account_id] [[-s|--SNSTopic SNS_Topic_Name] [[-r|--region region] [[-C|--CPUThreshold threshold] [[-S|--SSDThreshold threshold] [[-V|--VolumeThreshold threshold] [-F|--FileSystemID FileSystemID]') + + ################################################################################ + # Main logic starts here. + ################################################################################ + # + # Set some default values. + regions = [] + dryRun = False + # + # Check to see if there are any environment variables set. 
+ customerId = os.environ.get('customerId', '') + accountId = os.environ.get('accountId', '') + SNStopic = os.environ.get('SNStopic', '') + onlyFilesystemId = None + defaultCPUThreshold = int(os.environ.get('defaultCPUThreshold', defaultCPUThreshold)) + defaultSSDThreshold = int(os.environ.get('defaultSSDThreshold', defaultSSDThreshold)) + defaultVolumeThreshold = int(os.environ.get('defaultVolumeThreshold', defaultVolumeThreshold)) + regionsEnv = os.environ.get('regions', '') + if regionsEnv != '': + regions = regionsEnv.split(',') + # + # Check to see if we are being run from a command line or a Lambda function. + if os.environ.get('AWS_LAMBDA_FUNCTION_NAME') == None: + argumentList = sys.argv[1:] + options = "hc:a:s:dr:C:S:V:F:" + + longOptions = ["help", "customerID=", "accountID=", "SNSTopic=", "dryRun", "region=", "CPUThreshold=", "SSDThreshold=", "VolumeThreshold=", "FileSystemID="] + skip = False + try: + arguments, values = getopt.getopt(argumentList, options, longOptions) + + for currentArgument, currentValue in arguments: + if currentArgument in ("-h", "--help"): + usage() + skip = True + elif currentArgument in ("-c", "--customerID"): + customerId = currentValue + elif currentArgument in ("-a", "--accountID"): + accountId = currentValue + elif currentArgument in ("-s", "--SNSTopic"): + SNStopic = currentValue + elif currentArgument in ("-C", "--CPUThreshold"): + defaultCPUThreshold = int(currentValue) + elif currentArgument in ("-S", "--SSDThreshold"): + defaultSSDThreshold = int(currentValue) + elif currentArgument in ("-V", "--VolumeThreshold"): + defaultVolumeThreshold = int(currentValue) + elif currentArgument in ("-d", "--dryRun"): + dryRun = True + elif currentArgument in ("-r", "--region"): + regions += [currentValue] + elif currentArgument in ("-F", "--FileSystemID"): + onlyFilesystemId = currentValue + + except getopt.error as err: + print(str(err)) + usage() + skip = True + + if not skip: + lambda_handler(None, None) diff --git 
a/Monitoring/auto-add-cw-alarms/update-auto-add-cw-alarms-CF-Template b/Monitoring/auto-add-cw-alarms/update-auto-add-cw-alarms-CF-Template new file mode 100755 index 0000000..8c838cd --- /dev/null +++ b/Monitoring/auto-add-cw-alarms/update-auto-add-cw-alarms-CF-Template @@ -0,0 +1,46 @@ +#!/bin/bash +# +# This script is used to update the CloudFormation template with the latest +# version of the Lambda function. It will also update the version number in +# the template. +################################################################################# + +majorVersionNum=2 +file="auto_add_cw_alarms.py" + +tmpfile1=$(mktemp /tmp/tmpfile1.XXXXXX) +tmpfile2=$(mktemp /tmp/tmpfile2.XXXXXX) +trap "rm -f $tmpfile1 $tmpfile2" exit +# +# First get the monitoring code out of the CF template. +sed -e '1,/ZipFile/d' cloudformation.yaml > cloudformation.yaml.tmp +# +# Now get the Date and Version lines out of both files. +egrep -v '^ # Date:|^ # Version' cloudformation.yaml.tmp > $tmpfile1 +egrep -v '^# Date:|^# Version:' $file > $tmpfile2 + +if diff -w $tmpfile1 $tmpfile2 > /dev/null; then + echo "No changes to the monitor code." + rm -f cloudformation.yaml.tmp + rm -f $tmpfile1 $tmpfile2 + exit 0 +fi +# +# Get the number of commits in the git history for the file to calculate the minor version number. +minorVersionNum="$(git log "$file" | egrep '^commit' | wc -l)" +if [ -z "$minorVersionNum" ]; then + echo "Failed to calculate version number." 1>&2 + exit 1 +fi + +version="v${majorVersionNum}.${minorVersionNum}" +# +# Strip out the monitoring code. +sed -e '/ZipFile/,$d' cloudformation.yaml > cloudformation.yaml.tmp +echo " ZipFile: |" >> cloudformation.yaml.tmp +# +# Add the monitoring code to the CF template while updating the version and date. 
+cat "$file" | sed -e 's/^/ /' -e "s/%%VERSION%%/${version}/" -e "s/%%DATE%%/$(date +%Y-%m-%d-%H:%M:%S)/" >> cloudformation.yaml.tmp + +echo "Updating cloudformation.yaml" +mv cloudformation.yaml.tmp cloudformation.yaml diff --git a/Monitoring/monitor-ontap-services/updateMonOntapServiceCFTemplate b/Monitoring/monitor-ontap-services/updateMonOntapServiceCFTemplate index 3da3bbb..219a002 100755 --- a/Monitoring/monitor-ontap-services/updateMonOntapServiceCFTemplate +++ b/Monitoring/monitor-ontap-services/updateMonOntapServiceCFTemplate @@ -2,7 +2,7 @@ # # This script is used to update the CloudFormation template with the latest # version of the Lambda function. It will also update the version number in -# the template as well as create a git tag for the version. +# the template. ################################################################################# majorVersionNum=2 diff --git a/README.md b/README.md index 6ab3ad0..80c8345 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ Have a great idea? We'd love to hear it! Please email us at [ng-fsxn-github-samp * [FSx ONTAP iscsi volume creation automation for Windows](/Management-Utilities/iscsi-vol-create-and-mount) * [Warm Performance Tier](/Management-Utilities/warm_performance_tier) * [Monitoring](/Monitoring) + * [CloudWatch Dashboard for FSx for ONTAP](/Monitoring/CloudWatch-FSx) * [Export LUN metrics from an FSx ONTAP to Amazon CloudWatch](/Monitoring/LUN-monitoring) * [Automatically Add CloudWatch Alarms for FSx Resources](/Monitoring/auto-add-cw-alarms) * [Monitor ONTAP metrics from FSx ONTAP using python Lambda function](/Monitoring/monitor-ontap-services) diff --git a/Solutions/EKS-logs-to-ELK/README.md b/Solutions/EKS-logs-to-ELK/README.md index 8bfde53..7bef2be 100644 --- a/Solutions/EKS-logs-to-ELK/README.md +++ b/Solutions/EKS-logs-to-ELK/README.md @@ -3,36 +3,25 @@ A multi log solution using NetApp FSxN and Trident for collecting non-stdout logs from applications. 
- - - ## The problem -* Lets say you have your default application stream but you also want to maintain an access log and an audit log, each log has its own foramt, its own wirte frequesncy and even different permissions. -* There is a need to save each type in a different file but the same goal of collecting these logs and pushing them to log aggreagtion engines/storage. -* The chalange is that these file are located on the disposible Pod storage and cannot be accessed or streamed same as std out/std error logs. +* Lets say you have your default application stream but you also want to maintain an access log and an audit log, each log has its own format, its own write frequency and even different permissions. +* There is a need to save each type in a different file but the same goal of collecting these logs and pushing them to log aggregation engines/storage. +* The challenge is that these file are located on the disposable Pod storage and cannot be accessed or streamed same as std out/std error logs. * A more advance but still common scenario is when a container has more than one log stream / file. - - - ## Collecting logs using FSxN Trident persistent storage -With FSxN and Trident, you can create a shared namespace persistent storage platform and collect non-stdout logs into one location (ElasticSearch, Loki, S3, etc..), overcoming the common obstacles faced when implementing multilog solutions. +With FSxN and Trident, you can create a shared namespace persistent storage platform and collect non-stdout logs into one location (ElasticSearch, Loki, S3, etc..), overcoming the common obstacles faced when implementing multi log solutions. - - - - -### Solution Architecture Example ## Getting Started -The following sections provide quickstart instructions for multiple logs shippers. All of these assume that you have cloned this repository locally and are using a CLI thats current directory is the root of the code repository. 
+The following sections provide quick start instructions for multiple log shippers. All of these assume that you have cloned this repository locally and you are using a CLI with its current directory set to the root of the code repository. ### Prerequisites -* `Helm` - for reources installation. +* `Helm` - for resource installation. * `Kubectl` – for interacting with the EKS cluster. -* NetApp FSxN running on the same EKS vpc. +* NetApp FSxN running on the same EKS VPC. * TCP NFS ports should be open between the EKS nodes and the FSxN: `111`, `2049`, @@ -41,13 +30,17 @@ The following sections provide quickstart instructions for multiple logs shipper `4046`, `4049` - [Check NetAppKB instructions](https://kb.netapp.com/onprem/ontap/da/NAS/Which_Network_File_System_NFS_TCP_and_NFS_UDP_ports_are_used_on_the_storage_system) * Kubernetes Snapshot Custom Resources (CRD) and Snapshot Controller installed on EKS cluster: - Learn more about the snapshot requirements for your cluster in the ["How to Deploy Volume Snapshots”](https://kubernetes.io/blog/2020/12/10/kubernetes-1.20-volume-snapshot-moves-to-ga/#how-to-deploy-volume-snapshots) Kuberbetes blog. + Learn more about the snapshot requirements for your cluster in the ["How to Deploy Volume Snapshots”](https://kubernetes.io/blog/2020/12/10/kubernetes-1.20-volume-snapshot-moves-to-ga/#how-to-deploy-volume-snapshots) Kubernetes blog. * NetApp Trident operator CSI should be installed on EKS. [Check Trident installation guide using Helm](https://docs.netapp.com/us-en/trident/trident-get-started/kubernetes-deploy-helm.html#deploy-the-trident-operator-and-install-astra-trident-using-helm). ### Installation * Configure Trident CSI backend to connect to the FSxN file system. Create the backend configuration for the trident driver. 
Create secret on trident namespace and fill the FSxN password: -```kubectl create secret generic fsx-secret --from-literal=username=fsxadmin --from-literal=password= -n trident --create-namespace``` + +``` +kubectl create secret generic fsx-secret --from-literal=username=fsxadmin --from-literal=password= -n trident --create-namespace +``` + * Install trident-resources helm chart from this GitHub repository. The custom Helm chart includes: - `backend-tbc-ontap-nas.yaml` - backend configuration for using NFS on EKS @@ -58,17 +51,21 @@ The following sections provide quickstart instructions for multiple logs shipper The following variables should be filled on the Values.yaml or run the following by using `--set` Helm command. -* `namespace` - namespace of the Trident operator +* `namespace` - namespace of the Trident operator. Typically 'trident'. * `fsx.managment_lif` - FSxN ip address * `fsx.svm_name` - FSxN SVM name * `configuration.storageclass_nas` - NAS storage class name * `configuration.storageclass_san` - SAN (ISCSI) storage class name Then use helm to deploy the package: -```helm install trident-resources ./trident-resources -n trident``` +``` +helm install trident-resources ./trident-resources -n trident +``` Verify that FSxN has been successfully connected to the backend: -```kubectl get TridentBackendConfig -n trident``` +``` +kubectl get TridentBackendConfig -n trident +``` ### Implementing a sample application for collecting logs @@ -91,7 +88,7 @@ spec: storage: 100Gi storageClassName: trident-csi ``` - * `trident.netapp.io/shareFromPVC:` The primary PersistentVolumeClaim you have created previously. +* `trident.netapp.io/shareFromPVC:` The primary PersistentVolumeClaim you have created previously. 
* `storage` - volume size ##### **volume-reference-fsx.yaml**: @@ -136,7 +133,9 @@ spec: Installing an example application by helm: -```helm upgrade --install example-app ./examples/example-app -n rpc --create-namespace``` +``` +helm upgrade --install example-app ./examples/example-app -n rpc --create-namespace +``` When the application is deployed, you should be able to see the PVC as a mount at /log. @@ -151,7 +150,6 @@ Install Vector.dev agent as DeamonSet from [Helm chart](https://vector.dev/docs/ 1. Clone vector GitHub repository: ``` git clone https://github.com/vectordotdev/helm-charts.git - ``` 2. Adding override values: @@ -363,4 +361,4 @@ Unless required by applicable law or agreed to in writing, software distributed See the License for the specific language governing permissions and limitations under the License. -© 2024 NetApp, Inc. All Rights Reserved. \ No newline at end of file +© 2024 NetApp, Inc. All Rights Reserved.