diff --git a/.github/scs-compliance-check/openstack/clouds.yaml b/.github/scs-compliance-check/openstack/clouds.yaml index 325f6e33b..63a2b9805 100644 --- a/.github/scs-compliance-check/openstack/clouds.yaml +++ b/.github/scs-compliance-check/openstack/clouds.yaml @@ -89,6 +89,14 @@ clouds: auth: auth_url: https://identity.l1a.cloudandheat.com/v3 application_credential_id: "7ab4e3339ea04255bc131868974cfe63" + scaleup-occ2: + auth_type: v3applicationcredential + auth: + auth_url: https://keystone.occ2.scaleup.cloud + application_credential_id: "5d2eea4e8bf8448092490b4190d4430a" + region_name: "RegionOne" + interface: "public" + identity_api_version: 3 syseleven-dus2: interface: public identity_api_verion: 3 diff --git a/.github/workflows/check-scaleup-occ2-v4.yml b/.github/workflows/check-scaleup-occ2-v4.yml new file mode 100644 index 000000000..b5bf70a2d --- /dev/null +++ b/.github/workflows/check-scaleup-occ2-v4.yml @@ -0,0 +1,23 @@ +name: "Compliance IaaS v4 of scaleup-occ2" + +on: + # Trigger compliance check every day at 4:30 UTC + schedule: + - cron: '30 4 * * *' + # Trigger compliance check after Docker image has been built + workflow_run: + workflows: [Build and publish scs-compliance-check Docker image] + types: + - completed + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + check-scaleup-occ2: + uses: ./.github/workflows/scs-compliance-check-with-application-credential.yml + with: + version: v4 + layer: iaas + cloud: scaleup-occ2 + secret_name: OS_PASSWORD_SCALEUP_OCC2 + secrets: inherit diff --git a/.github/workflows/lint-golang.yml b/.github/workflows/lint-golang.yml new file mode 100644 index 000000000..faf7fdc8c --- /dev/null +++ b/.github/workflows/lint-golang.yml @@ -0,0 +1,28 @@ +name: Check Go syntax + +on: + push: + paths: + - 'Tests/kaas/kaas-sonobuoy-tests/**/*.go' + - .github/workflows/lint-go.yml + +jobs: + lint-go-syntax: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: '1.23' + + # Install golangci-lint + - name: Install golangci-lint + run: | + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.61.0 + + # Run golangci-lint + - name: Run golangci-lint + working-directory: Tests/kaas/kaas-sonobuoy-tests + run: golangci-lint run ./... 
-v diff --git a/.gitignore b/.gitignore index 4d7851fab..2b83a0983 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ **/__pycache__/ .venv/ .idea +.sandbox .DS_Store node_modules Tests/kaas/results/ +Tests/kaas/kaas-sonobuoy-tests/results/ *.tar.gz diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 94fca3275..4d44024bd 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -43,9 +43,10 @@ { "name": "double-spaces", "message": "Avoid double spaces", - "searchPattern": "/([^\\s>]) ([^\\s|])/g", + "searchPattern": "/([^\\s>|]) ([^\\s|])/g", "replace": "$1 $2", - "skipCode": true + "skipCode": true, + "tables": false } ] } diff --git a/.zuul.d/secure.yaml b/.zuul.d/secure.yaml index 318a1b6b0..78dbb906f 100644 --- a/.zuul.d/secure.yaml +++ b/.zuul.d/secure.yaml @@ -233,6 +233,28 @@ VCsXjf0qBBMrzz6HP9z95Bk44fiJ3L/LkA3Iij961dYrQXbZKDrKOiX/QPwrcSrVmjmew UbPexJFHgvTCqjadoLejSt9cUd9lVzhuzLJ8CS+CcCMbZOno6qathrd2B88riQaPNIGNu gfkNT9R63ZzKB1qIA2n5RZi7SH9DPIUd0AwLMn2bhp3uok5pNAPP/4/1RkQiCA= + scaleup_occ2_ac_id: !encrypted/pkcs1-oaep + - N2duwkcMdOXw6wF0deE/0BPM1M/URt3eWmrnBJ89VHeCDENGfTfDHcWPYs3wW4rSRCG6t + gqgNuA049OvOhL7rtjNHZ6yIj6xEHH/YdqT4UxjXPS9GFwoJXDtE8rIGjK3KU8GfUgKnG + DLplyyzGzx5j39rJAS628InmC56aip47rO1J4HQE9Ku25Wb06R7ykx+0ZOWr0HXjV/VsV + uwfyL+DPgewbL+4u8/XkcI0FwAM9/KkF/CcYUq5aVMdQS2foatTQW0C2idg+pffSTRaau + VF44rkVfzsCOz4MYAFpLIaL9Zxx1FifaPOd0oi6rEFjGd6vFtFCHk1BRpKmOITLyx3Te5 + zVffSkQAsqpn/4er8800bjQzxXvqmQmR0QwPM7dhvRnrNbTSCA/Awm5BPaUgeCZFN3MPN + Mc0XIaEwjuJvDK6fqj5tJrVIs5bxAmqRDj8d76AlJcOdDxHicTHgR3aUG4AKOWkUsskgQ + 3xR8lPh31O/HgzG9tq6o/DCPA1O9wyyOyT7KwJAaRASPCA1O80ZAzhZUNUVyut6dYEwaS + QXP4IaEJOxP8EkxR7FDEuO99UFZ7TXQ1CF7ots4wIs5tEpQvcdLnvBjJckp0fNBFTuGMm + FCvhgBK30NC93U4DxQv6xZBhqtvHYjHcTOXvz2fryRJT2teMN+eI+RDdV1Jj8Y= + scaleup_occ2_ac_secret: !encrypted/pkcs1-oaep + - LfUHhslK41JDp3CpslWGGA4bZ3udZh4KnytcXohkdbchb8QVt8eNc4nD0ti0/XS18YKwq + DlHOWw2rDJZ8RGIXENVUYzDbECoBErE8IAqQE0q3oS/8Oq0NYOFTGvvlKuue7U4s87Pwi + YFi+Q0Rv7vO8cWFVtbRHK+Hw6pC42Biq2T+tuVBCLqylIMViXpuEy9UpFLEv59zr6EHa9 + uB3xkjnpWuabe7vrG+LQHc0pJ5tNhcLiOnJggU5Ef02FBy+t6xvuJW8f6cXCnRRj1q0fl + D/vTmC7avwHnWC+J4WLL69HCwW05I7iHftVSWOXQgRzMBd4D4ND2OXfsWElu0eOV5XG6X + JsQH8lDnVN/lqaDAOYR4fk4+9yt3RURwvNL5FUnDK1t7LAI4X0gcvLrQAfzgOlpBYDXSK + 0kbUzqwivuw1v2zO/gxQU+J28PsOfZaKf/7ZZyj3e/tiq4wBpvPb0mVBwWXigKqzr+QED + Iy2u/g3x2qdcTpXR/RPq+xiXM2B2rw1V5gdkscdL+avXtTF7hT9HrcayHx3HDZ/h6aGPD + RWIJ8bstl+x2Q4zExgR13amWM8ZR1iLGCN20U/ZAaqANCqjDbrSVSTjTPzYtNFwAXwxkB + 3NHhPDHZ1MIdr6IJE4IZ4TCMsIeTA2UHNfF4RCzeDSIJ+CXOQxUFWOxZkf97WY= syseleven_dus2_ac_id: !encrypted/pkcs1-oaep - SjwtIvJO7DkLJDmS+T/Z5utFBa22hmPRBd8mzonJHGgURB2W7fmXFreD9NPrLfbt7ujKi KNqJm8k1Vr1F3Mu+Osr0BWSnq5makwVt2ikBY4qPbL8iyVXsByaT/HNPLCOokqy+REpfu diff --git a/README.md b/README.md index 2685faddb..5052b25bf 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,10 @@ - # Sovereign Cloud Stack – Standards and Certification SCS unifies the best of cloud computing in a certified standard. With a decentralized and federated cloud stack, SCS puts users in control of their data and fosters trust in clouds, backed by a global open-source community. ## SCS compatible clouds -This is a list of clouds that we test on a nightly basis against our `scs-compatible` certification level. 
- -| Name | Description | Operator | _SCS-compatible IaaS_ Compliance | HealthMon | -| -------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | ----------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | -| [gx-scs](https://github.com/SovereignCloudStack/docs/blob/main/community/cloud-resources/plusserver-gx-scs.md) | Dev environment provided for SCS & GAIA-X context | plusserver GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-gx-scs-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-gx-scs-v4.yml) | [HM](https://health.gx-scs.sovereignit.cloud:3000/) | -| [pluscloud open](https://www.plusserver.com/en/products/pluscloud-open)
- prod1
- prod2
- prod3
- prod4 | Public cloud for customers (4 regions) | plusserver GmbH |  
- prod1 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-pco-prod1-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-pco-prod1-v4.yml)
- prod2 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-pco-prod2-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-pco-prod2-v4.yml)
- prod3 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-pco-prod3-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-pco-prod3-v4.yml)
- prod4 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-pco-prod4-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-pco-prod4-v4.yml) |  
[HM1](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-pco)
[HM2](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod2)
[HM3](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod3)
[HM4](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod4) | -| [Wavestack](https://www.noris.de/wavestack-cloud/) | Public cloud for customers | noris network AG/Wavecon GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-wavestack-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-wavestack-v4.yml) | [HM](https://health.wavestack1.sovereignit.cloud:3000/) | -| [REGIO.cloud](https://regio.digital) | Public cloud for customers | OSISM GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-regio-a-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-regio-a-v4.yml) | broken | -| [CNDS](https://cnds.io/) | Public cloud for customers | artcodix GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-artcodix-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-artcodix-v4.yml) | [HM](https://ohm.muc.cloud.cnds.io/) | -| [aov.cloud](https://www.aov.de/) | Community cloud for customers | aov IT.Services GmbH | (soon) | [HM](https://health.aov.cloud/) | -| PoC WG-Cloud OSBA | Cloud PoC for FITKO (yaook-based) | Cloud&Heat Technologies GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-poc-wgcloud-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-poc-wgcloud-v4.yml) | [HM](https://health.poc-wgcloud.osba.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?var-mycloud=poc-wgcloud&orgId=1) | -| PoC KDO | Cloud PoC for FITKO | KDO Service GmbH / OSISM GmbH | [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-poc-kdo-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-poc-kdo-v4.yml) | (soon) | -| [syseleven](https://www.syseleven.de/en/products-services/openstack-cloud/)
- dus2
- ham1 | Public OpenStack Cloud (2 SCS regions) | SysEleven GmbH |  
- dus2 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-syseleven-dus2-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-syseleven-dus2-v4.yml)
- ham1 [![Compliance Status](https://img.shields.io/github/actions/workflow/status/SovereignCloudStack/standards/check-syseleven-ham1-v4.yml?label=v4)](https://github.com/SovereignCloudStack/standards/actions/workflows/check-syseleven-ham1-v4.yml) |  
(soon)
(soon) | +See [Compliant clouds overview](https://docs.scs.community/standards/certification/overview) on our docs page. ## SCS standards overview diff --git a/Standards/scs-0001-v1-sovereign-cloud-standards.md b/Standards/scs-0001-v1-sovereign-cloud-standards.md index 48ef64c49..eabfff020 100644 --- a/Standards/scs-0001-v1-sovereign-cloud-standards.md +++ b/Standards/scs-0001-v1-sovereign-cloud-standards.md @@ -107,7 +107,7 @@ embedded in the markdown header. | Field name | Requirement | Description | | --------------- | -------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | | `type` | REQUIRED | one of `Procedural`, `Standard`, `Decision Record`, or `Supplement` | -| `status` | REQUIRED | one of `Proposal`, `Draft`, `Stable`, `Deprecated`, or `Rejected` | +| `status` | REQUIRED | one of `Draft`, `Stable`, `Deprecated`, or `Rejected` | | `track` | REQUIRED | one of `Global`, `IaaS`, `KaaS`, `IAM`, `Ops` | | `supplements` | REQUIRED precisely when `type` is `Supplement` | list of documents that are extended by this document (e.g., multiple major versions) | | `deprecated_at` | REQUIRED if `status` is `Deprecated` | ISO formatted date indicating the date after which the deprecation is in effect | @@ -167,11 +167,11 @@ In addition, the following OPTIONAL sections should be considered: ## Process The lifecycle of an SCS document goes through the following phases: -Proposal, Draft, Stable, Deprecated, and Rejected. +Draft, Stable, Deprecated, and Rejected. ```mermaid graph TD - A[Proposal] -->|Pull Request| B[Draft] + A["Draft (Proposal)"] -->|Pull Request| B[Draft] B -->|Pull Request| D[Stable] B -->|Pull Request| E[Rejected] D -->|Pull Request| F[Deprecated] @@ -195,8 +195,15 @@ Supplements may be kept in Draft state, because they are not authoritative. To propose a new SCS document, a community participant creates a pull request on GitHub against the [standards repository in the SovereignCloudStack organisation][scs-standards-repo]. - -The pull request MUST add exactly one SCS document, +In the beginning, the pull request will contain a draft of an SCS document and +the community participant should present it to the SCS community. +They may refer to the [SCS Community page](https://docs.scs.community/community/) +for an overview of applicable means of communication and online meetings +to get in touch with the SCS community. +Community participants are encouraged to present their proposal to the SCS community early on. +Note that the proposal draft's content does not need to be finished in any way at this stage. + +The pull request for the proposal MUST add exactly one SCS document, in the `Standards` folder. In the proposal phase, the document number MUST be replaced with `xxxx` in the file name, @@ -209,7 +216,7 @@ for a Supplement of `scs-0100-v3-flavor-naming.md`, the file name might be `scs-0100-w1-flavor-naming-implementation-testing.md` (note the `w1`!). The metadata MUST indicate the intended `track` and `type` of the document, -and the `status` MUST be set to `Proposal`; +and the `status` MUST be set to `Draft`; for a Supplement, the `supplements` field MUST be set to a list of documents (usually containing one element). @@ -217,7 +224,8 @@ Upon acceptance by the group of people identified by the `track`, a number is assigned (the next unused number) and the proposer is asked -to rename the file to replace the `xxxx` with that number. 
+to rename the file to replace the `xxxx` with that number +before the merge of the pull request. **Note:** Documents on the `Design Record` track MAY be proposed or accepted directly into `Stable` state, diff --git a/Standards/scs-0100-v3-flavor-naming.md b/Standards/scs-0100-v3-flavor-naming.md index ce09dd0ee..587bde220 100644 --- a/Standards/scs-0100-v3-flavor-naming.md +++ b/Standards/scs-0100-v3-flavor-naming.md @@ -14,7 +14,7 @@ description: | ## Introduction -This is the standard v3.1 for SCS Release 5. +This is the standard v3.2 for SCS Release 8. Note that we intend to only extend it (so it's always backwards compatible), but try to avoid changing in incompatible ways. (See at the end for the v1 to v2 transition where we have not met that @@ -366,13 +366,15 @@ The options for arch are as follows: The generation is vendor specific and can be left out, but it can only be specified in conjunction with a vendor. At present, these values are possible: -| Generation | i (Intel x86-64) | z (AMD x86-64) |  a (AArch64) | r (RISC-V) | -| ---------- | ---------------- | -------------- | ------------------ | ---------- | -| 0 | pre Skylake | pre Zen | pre Cortex A76 | TBD | -| 1 | Skylake | Zen-1 (Naples) | A76/NeoN1 class | TBD | -| 2 | Cascade Lake | Zen-2 (Rome) | A78/x1/NeoV1 class | TBD | -| 3 | Ice Lake | Zen-3 (Milan) | A71x/NeoN2 (ARMv9) | TBD | -| 4 | Sapphire Rapids | Zen-4 (Genoa) | | TBD | +| Generation | i (Intel x86-64) | z (AMD x86-64) |  a (AArch64) | r (RISC-V) | +| ---------- | ----------------- | -------------- | -------------------- | ---------- | +| 0 | pre Skylake | pre Zen | pre Cortex A76 | TBD | +| 1 | Skylake | Zen-1 (Naples) | A76/NeoN1 class | TBD | +| 2 | Cascade Lake | Zen-2 (Rome) | A78/x1/NeoV1 class | TBD | +| 3 | Ice Lake | Zen-3 (Milan) | A71x/NeoN2/V2(ARMv9) | TBD | +| 4 | Sapphire Rapids | Zen-4 (Genoa) | AmpereOne (ARMv8.6) | TBD | +| 5 | Sierra Forest(E) | Zen-5 (Turin) | A72x/NeoN3/V3(Av9.2) | TBD | +| 6 | Granite Rapids(P) | | | TBD | It is recommended to leave out the `0` when specifying the old generation; this will help the parser tool, which assumes 0 for an unspecified value and does leave it @@ -384,8 +386,11 @@ out when generating the name for comparison. In other words: 0 has a meaning of We don't differentiate between Zen-4 (Genoa) and Zen-4c (Bergamo); L3 cache per Siena core is smaller on Bergamo and the frequency lower but the cores are otherwise identical. As we already have a qualifier `h` that allows to specify higher frequencies -(which Genoa thus may use more and Bergamo less or not), we have enough distinction -capabilities. +(which Genoa thus may use more and Bergamo not), we have enough distinction +capabilities. The same applies to Zen-5 (Turin) and Zen-5c (Turin Dense). +For intel with the server E-cores (Crestmont), these received their own +generation assignment, as the difference to the server P-cores (Redwood Cove) +is more significant. ::: @@ -412,7 +417,7 @@ capabilities. 
### [OPTIONAL] GPU support

-Format: `_`\[`G/g`\]X\[N\]\[`-`M\]\[`h`\]
+Format: `_`\[`G/g`\]X\[N\[`-`M\[`h`\]\[`-`V\[`h`\]\]\]\]

This extension provides more details on the specific GPU:
@@ -420,7 +425,9 @@ This extension provides more details on the specific GPU:
- vendor (X)
- generation (N)
- number (M) of processing units that are exposed (for pass-through) or assigned; see table below for vendor-specific terminology
-- high-performance indicator (`h`)
+- high-frequency indicator (`h`) for compute units
+- amount of video memory (V) in GiB
+- an indicator for high-bandwidth memory

Note that the vendor letter X is mandatory, generation and processing units are optional.
@@ -430,18 +437,34 @@ Note that the vendor letter X is mandatory, generation and processing units are
| `A` | AMD | compute units (CUs) |
| `I` | Intel | execution units (EUs) |

-For nVidia, the generation N can be f=Fermi, k=Kepler, m=Maxwell, p=Pascal, v=Volta, t=turing, a=Ampere, l=Ada Lovelace, ...,
-for AMD GCN-x=0.x, RDNA1=1, RDNA2=2, RDNA3=3,
-for Intel Gen9=0.9, Xe(12.1)=1, ...
+For nVidia, the generation N can be f=Fermi, k=Kepler, m=Maxwell, p=Pascal, v=Volta, t=turing, a=Ampere, l=Ada Lovelace, g=Grace Hopper, ...,
+for AMD GCN-x=0.x, RDNA1=1, C/RDNA2=2, C/RDNA3=3, C/RDNA3.5=3.5, C/RDNA4=4, ...
+for Intel Gen9=0.9, Xe(12.1/DG1)=1, Xe(12.2)=2, Arc(12.7/DG2)=3 ...
(Note: This may need further work to properly reflect what's out there.)

-The optional `h` suffix to the compute unit count indicates high-performance (e.g. high freq or special
-high bandwidth gfx memory such as HBM);
-`h` can be duplicated for even higher performance.
+The optional `h` suffix to the compute unit count indicates high-frequency GPU compute units.
+It is not normally recommended to use it except if there are several variants of cards within
+a generation of GPUs and with a similar number of SMs/CUs/EUs.
+In case there are even more than two variants, the letter `h` can be duplicated for even
+higher frequencies.

-Example: `SCS-16V-64-500s_GNa-14h`
-This flavor has a pass-through GPU nVidia Ampere with 14 SMs and either high-bandwidth memory or specially high frequencies.
-Looking through GPU specs you could guess it's 1/4 of an A30.
+Please note that there are GPUs from one generation and vendor that have vastly different sizes
+(or different fractions are being passed to an instance with multi-instance-GPUs). The number
+M allows differentiating between them and gives an indicator of the compute capability and
+parallelism. M cannot be compared at all between different generations, let alone different
+vendors.
+
+The amount of video memory dedicated to the instance can be indicated by V (in binary
+Gigabytes). This number needs to be an integer - fractional memory sizes must be rounded
+down. An optional `h` can be used to indicate high bandwidth memory (such as HBM2+) with
+bandwidths well above 1GiB/s.
+
+Example: `SCS-16V-64-500s_GNa-14-6h`
+This flavor has a pass-through GPU nVidia Ampere with 14 SMs and 6 GiB of high-bandwidth video
+memory. Looking through GPU specs you could guess it's 1/4 of an A30.
+
+We have a table with common GPUs in the
+[implementation hints for this standard](scs-0100-w1-flavor-naming-implementation-testing.md).

### [OPTIONAL] Infiniband
@@ -485,14 +508,14 @@ an image is considered broken by the SCS team.
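To illustrate the GPU extension format described above, the following is a minimal, hypothetical Python sketch of how such a name part could be composed from vendor, generation, unit count and VRAM size; the function name and structure are illustrative only and are not part of the official flavor-naming tooling (`cli.py`):

```python
# Illustrative sketch only -- not the official SCS flavor-naming tool (cli.py).
def gpu_name_part(passthrough, vendor, generation="", units=None, unit_h=0,
                  vram_gib=None, vram_h=False):
    """Compose the optional GPU extension `_[G/g]X[N[-M[h][-V[h]]]]`."""
    part = "_" + ("G" if passthrough else "g") + vendor    # vendor letter X is mandatory
    if generation:
        part += generation                                  # generation N (vendor-specific)
        if units is not None:
            part += f"-{units}" + "h" * unit_h              # M units, optional high-frequency marker(s)
            if vram_gib is not None:
                part += f"-{vram_gib}" + ("h" if vram_h else "")  # V GiB VRAM, `h` = high-bandwidth memory
    return part

# 1/4 of an A30 passed through: Ampere, 14 SMs, 6 GiB of HBM2 -> "_GNa-14-6h"
print(gpu_name_part(True, "N", "a", units=14, vram_gib=6, vram_h=True))
```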
## Proposal Examples -| Example | Decoding | -| ------------------------- | ---------------------------------------------------------------------------------------------- | -| SCS-2C-4-10n | 2 dedicated cores (x86-64), 4GiB RAM, 10GB network disk | -| SCS-8Ti-32-50p_i1 | 8 dedicated hyperthreads (insecure), Skylake, 32GiB RAM, 50GB local NVMe | -| SCS-1L-1u-5 | 1 vCPU (heavily oversubscribed), 1GiB Ram (no ECC), 5GB disk (unspecific) | -| SCS-16T-64-200s_GNa-64_ib | 16 dedicated threads, 64GiB RAM, 200GB local SSD, Infiniband, 64 Passthrough nVidia Ampere SMs | -| SCS-4C-16-2x200p_a1 | 4 dedicated Arm64 cores (A76 class), 16GiB RAM, 2x200GB local NVMe drives | -| SCS-1V-0.5 | 1 vCPU, 0.5GiB RAM, no disk (boot from cinder volume) | +| Example | Decoding | +| ------------------------------ | ---------------------------------------------------------------------------------------------- | +| `SCS-2C-4-10n` | 2 dedicated cores (x86-64), 4GiB RAM, 10GB network disk | +| `SCS-8Ti-32-50p_i1` | 8 dedicated hyperthreads (insecure), Skylake, 32GiB RAM, 50GB local NVMe | +| `SCS-1L-1u-5` | 1 vCPU (heavily oversubscribed), 1GiB Ram (no ECC), 5GB disk (unspecific) | +| `SCS-16T-64-200s_GNa-72-24_ib` | 16 dedicated threads, 64GiB RAM, 200GB local SSD, Infiniband, 72 Passthrough nVidia Ampere SMs | +| `SCS-4C-16-2x200p_a1` | 4 dedicated Arm64 cores (A76 class), 16GiB RAM, 2x200GB local NVMe drives | +| `SCS-1V-0.5` | 1 vCPU, 0.5GiB RAM, no disk (boot from cinder volume) | ## Previous standard versions diff --git a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md index 71756e07d..868215476 100644 --- a/Standards/scs-0100-w1-flavor-naming-implementation-testing.md +++ b/Standards/scs-0100-w1-flavor-naming-implementation-testing.md @@ -2,7 +2,7 @@ title: "SCS Flavor Naming Standard: Implementation and Testing Notes" type: Supplement track: IaaS -status: Proposal +status: Draft supplements: - scs-0100-v1-flavor-naming.md - scs-0100-v2-flavor-naming.md @@ -32,7 +32,8 @@ See the [README](https://github.com/SovereignCloudStack/standards/tree/main/Test for more details. The functionality of this script is also (partially) exposed via the web page -[https://flavors.scs.community/](https://flavors.scs.community/). +[https://flavors.scs.community/](https://flavors.scs.community/), which can both +parse SCS flavors names as well as generate them. With the OpenStack tooling (`python3-openstackclient`, `OS_CLOUD`) in place, you can call `cli.py -v parse v3 $(openstack flavor list -f value -c Name)` to get a report @@ -45,6 +46,107 @@ will create a whole set of flavors in one go. To that end, it provides different options: either the standard mandatory and possibly recommended flavors can be created, or the user can set a file containing his flavors. +### GPU table + +The most commonly used datacenter GPUs are listed here, showing what GPUs (or partitions +of a GPU) result in what GPU part of the flavor name. + +#### Nvidia (`N`) + +We show the most popular recent generations here. Older one are of course possible as well. + +##### Ampere (`a`) + +One Streaming Multiprocessor on Ampere has 64 (A30, A100) or 128 Cuda Cores (A10, A40). 
+ +GPUs without MIG (one SM has 128 Cuda Cores and 4 Tensor Cores): + +| Nvidia GPU | Tensor C | Cuda Cores | SMs | VRAM | SCS name piece | +|------------|----------|------------|-----|-----------|----------------| +| A10 | 288 | 9216 | 72 | 24G GDDR6 | `GNa-72-24` | +| A40 | 336 | 10752 | 84 | 48G GDDR6 | `GNa-84-48` | + +GPUs with Multi-Instance-GPU (MIG), where GPUs can be partitioned and the partitions handed +out as as pass-through PCIe devices to instances. One SM corresponds to 64 Cuda Cores and +4 Tensor Cores. + +| Nvidia GPU | Fraction | Tensor C | Cuda Cores | SMs | VRAM | SCS GPU name | +|------------|----------|----------|------------|-----|-----------|----------------| +| A30 | 1/1 | 224 | 3584 | 56 | 24G HBM2 | `GNa-56-24` | +| A30 | 1/2 | 112 | 1792 | 28 | 12G HBM2 | `GNa-28-12` | +| A30 | 1/4 | 56 | 896 | 14 | 6G HBM2 | `GNa-14-6` | +| A30X | 1/1 | 224 | 3584 | 56 | 24G HBM2e | `GNa-56h-24h` | +| A100 | 1/1 | 432 | 6912 | 108 | 80G HBM2e | `GNa-108h-80h` | +| A100 | 1/2 | 216 | 3456 | 54 | 40G HBM2e | `GNa-54h-40h` | +| A100 | 1/4 | 108 | 1728 | 27 | 20G HBM2e | `GNa-27h-20h` | +| A100 | 1/7 | 60+ | 960+ | 15+| 10G HBM2e | `GNa-15h-10h`+ | +| A100X | 1/1 | 432 | 6912 | 108 | 80G HBM2e | `GNa-108-80h` | + +[+] The precise numbers for the 1/7 MIG configurations are not known by the author of +this document and need validation. + +##### Ada Lovelave (`l`) + +No MIG support, 128 Cuda Cores and 4 Tensor Cores per SM. + +| Nvidia GPU | Tensor C | Cuda Cores | SMs | VRAM | SCS name piece | +|------------|----------|------------|-----|-----------|----------------| +| L4 | 232 | 7424 | 58 | 24G GDDR6 | `GNl-58-24` | +| L40 | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142-48` | +| L40G | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142h-48` | +| L40S | 568 | 18176 | 142 | 48G GDDR6 | `GNl-142hh-48` | + +##### Grace Hopper (`g`) + +These have MIG support and 128 Cuda Cores and 4 Tensor Cores per SM. + +| Nvidia GPU | Fraction | Tensor C | Cuda Cores | SMs | VRAM | SCS GPU name | +|------------|----------|----------|------------|-----|------------|----------------| +| H100 | 1/1 | 528 | 16896 | 132 | 80G HBM3 | `GNg-132-80h` | +| H100 | 1/2 | 264 | 8448 | 66 | 40G HBM3 | `GNg-66-40h` | +| H100 | 1/4 | 132 | 4224 | 33 | 20G HBM3 | `GNg-33-20h` | +| H100 | 1/7 | 72+ | 2304+ | 18+| 10G HBM3 | `GNg-18-10h`+ | +| H200 | 1/1 | 528 | 16896 | 132 | 141G HBM3e | `GNg-132-141h` | +| H200 | 1/2 | 264 | 16896 | 66 | 70G HBM3e | `GNg-66-70h` | +| ... | + +[+] The precise numbers for the 1/7 MIG configurations are not known by the author of +this document and need validation. + +#### AMD Radeon (`A`) + +##### CDNA 2 (`2`) + +One CU contains 64 Stream Processors. + +| AMD Instinct| Stream Proc | CUs | VRAM | SCS name piece | +|-------------|-------------|-----|------------|----------------| +| Inst MI210 | 6656 | 104 | 64G HBM2e | `GA2-104-64h` | +| Inst MI250 | 13312 | 208 | 128G HBM2e | `GA2-208-128h` | +| Inst MI250X | 14080 | 229 | 128G HBM2e | `GA2-220-128h` | + +##### CDNA 3 (`3`) + +SRIOV partitioning is possible, resulting in pass-through for +up to 8 partitions, somewhat similar to Nvidia MIG. 4 Tensor +Cores and 64 Stream Processors per CU. 
+ +| AMD GPU | Tensor C | Stream Proc | CUs | VRAM | SCS name piece | +|-------------|----------|-------------|-----|------------|----------------| +| Inst MI300X | 1216 | 19456 | 304 | 192G HBM3 | `GA3-304-192h` | +| Inst MI325X | 1216 | 19456 | 304 | 288G HBM3 | `GA3-304-288h` | + +#### intel Xe (`I`) + +##### Xe-HPC (Ponte Vecchio) (`3`) + +1 EU corresponds to one Tensor Core and contains 128 Shading Units. + +| intel DC GPU | Tensor C | Shading U | EUs | VRAM | SCS name part | +|--------------|----------|-----------|-----|------------|----------------| +| Max 1100 | 56 | 7168 | 56 | 48G HBM2e | `GI3-56-48h` | +| Max 1550 | 128 | 16384 | 128 | 128G HBM2e | `GI3-128-128h` | + ## Automated tests ### Errors diff --git a/Standards/scs-0101-w1-entropy-implementation-testing.md b/Standards/scs-0101-w1-entropy-implementation-testing.md index 432a25fec..19e1f43dc 100644 --- a/Standards/scs-0101-w1-entropy-implementation-testing.md +++ b/Standards/scs-0101-w1-entropy-implementation-testing.md @@ -2,7 +2,7 @@ title: "SCS Entropy: Implementation and Testing Notes" type: Supplement track: IaaS -status: Proposal +status: Draft supplements: - scs-0101-v1-entropy.md --- diff --git a/Standards/scs-0102-v1-image-metadata.md b/Standards/scs-0102-v1-image-metadata.md index 8b0ab98ba..18d42adf7 100644 --- a/Standards/scs-0102-v1-image-metadata.md +++ b/Standards/scs-0102-v1-image-metadata.md @@ -1,5 +1,5 @@ --- -title: SCS Image Metadata Standard +title: SCS Image Metadata type: Standard stabilized_at: 2022-10-31 status: Stable diff --git a/Standards/scs-0102-w1-image-metadata-implementation-testing.md b/Standards/scs-0102-w1-image-metadata-implementation-testing.md index 05fb05831..b2d9f5b75 100644 --- a/Standards/scs-0102-w1-image-metadata-implementation-testing.md +++ b/Standards/scs-0102-w1-image-metadata-implementation-testing.md @@ -2,7 +2,7 @@ title: "SCS Image Metadata: Implementation and Testing Notes" type: Supplement track: IaaS -status: Proposal +status: Draft supplements: - scs-0102-v1-image-metadata.md --- diff --git a/Standards/scs-0104-w1-standard-images-implementation.md b/Standards/scs-0104-w1-standard-images-implementation.md index 07b5715ee..9a18a9056 100644 --- a/Standards/scs-0104-w1-standard-images-implementation.md +++ b/Standards/scs-0104-w1-standard-images-implementation.md @@ -2,7 +2,7 @@ title: "SCS Standard Images: Implementation Notes" type: Supplement track: IaaS -status: Proposal +status: Draft supplements: - scs-0104-v1-standard-images.md --- diff --git a/Standards/scs-0111-v1-volume-type-decisions.md b/Standards/scs-0111-v1-volume-type-decisions.md index 28fc32e8b..aaf3e522f 100644 --- a/Standards/scs-0111-v1-volume-type-decisions.md +++ b/Standards/scs-0111-v1-volume-type-decisions.md @@ -7,7 +7,7 @@ track: IaaS ## Introduction -Volumes in OpenStack are virtual drives. They are managed by the storage service Cinder, which abstracts creation and usage of many different storage backends. While it is possible to use a backend like lvm which can reside on the same host as the hypervisor, the SCS wants to make a more clear differentiation between volumes and the ephemeral storage of a virtual machine. For all SCS deployments we want to assume that volumes are always residing in a storage backend that is NOT on the same host as a hypervisor - in short terms: Volumes are network storage. Ephemeral storage on the other hand is the only storage residing on a compute host. 
It is created by creating a VM directly from an Image and is automatically los as soon as the VM cease to exist. Volumes on the other hand have to be created from Images and only after that can be used for VMs. They are persistent and will remain in the last state a VM has written on them before they cease to exit. Being persistent and not relying on the host where the VM resides, Volumes can easily be attached to another VM in case of a node outage and VMs be migrated way more easily, because only metadata and data in RAM has to be shifted to another host, accelerating any migration or evacuation of a VM. +Volumes in OpenStack are virtual drives. They are managed by the storage service Cinder, which abstracts creation and usage of many different storage backends. While it is possible to use a backend like lvm which can reside on the same host as the hypervisor, this decision record wants to make a more clear differentiation between volumes and the ephemeral storage of a virtual machine. For all SCS deployments we want to assume that volumes are always residing in a storage backend that is NOT on the same host as a hypervisor - in short terms: Volumes are network storage. Ephemeral storage on the other hand is the only storage residing on a compute host. It is created by creating a VM directly from an Image and is automatically lost as soon as the VM cease to exist. Volumes on the other hand have to be created from Images and only after that can be used for VMs. They are persistent and will remain in the last state a VM has written on them before they cease to exit. Being persistent and not relying on the host where the VM resides, Volumes can easily be attached to another VM in case of a node outage and VMs be migrated way more easily, because only metadata and data in RAM has to be shifted to another host, accelerating any migration or evacuation of a VM. Volume Types are used to classify volumes and provide a basic decision for what kind of volume should be created. These volume types can sometimes very be backend-specific, and it might be hard for a user to choose the most suitable volume type, if there is more than one default type. Nevertheless, most of the configuration is done in the backends themselves, so volume types only work as a rough classification. diff --git a/Standards/scs-0114-v1-volume-type-standard.md b/Standards/scs-0114-v1-volume-type-standard.md index 9ed0d730c..003db9a24 100644 --- a/Standards/scs-0114-v1-volume-type-standard.md +++ b/Standards/scs-0114-v1-volume-type-standard.md @@ -1,8 +1,9 @@ --- -title: Volume Type Standard +title: SCS Volume Types type: Standard -status: Draft -track: IaaS +status: Stable +stabilized_at: 2024-11-13 +track: IaaS --- ## Introduction diff --git a/Standards/scs-0115-v1-default-rules-for-security-groups.md b/Standards/scs-0115-v1-default-rules-for-security-groups.md index b118dcf1f..8809a2857 100644 --- a/Standards/scs-0115-v1-default-rules-for-security-groups.md +++ b/Standards/scs-0115-v1-default-rules-for-security-groups.md @@ -1,7 +1,8 @@ --- title: Default Rules for Security Groups type: Standard -status: Draft +status: Stable +stabilized_at: 2024-11-13 track: IaaS --- @@ -25,7 +26,7 @@ Administrator (abbr. 
Admin) ### Default Security Groups, Custom Security Groups and default Security Group Rules -To properly understand the concepts in this standard and avoid ambiguity, is very important to distinguish between the following similar-sounding but different resources in the OpenStack Networking API: +To properly understand the concepts in this standard and avoid ambiguity, it is very important to distinguish between the following similar-sounding but different resources in the OpenStack Networking API: 1. default Security Group 2. custom Security Group @@ -59,10 +60,10 @@ Therefore, this standard proposes default Security Group rules that MUST be set ## Design Considerations -Up to the 2023.1 release (antelope) the default Security Group rules are hardcoded in the OpenStack code. -We should not require to change this behavior through code changes in deployments. +Up to the 2023.1 release (Antelope) the default Security Group rules are defined in the OpenStack code. +We should not require changing this behavior through code changes in deployments. -Beginning with the 2023.2 release (bobcat) the default Security Group rules can now be edited by administrators through an API. +Beginning with the 2023.2 release (Bobcat) the default Security Group rules can now be edited by administrators through an API. All rules that should be present as default in Security Groups have to be configured by admins through this API. There are two ways to approach a standard for the default rules of Security Groups. diff --git a/Standards/scs-0116-v1-key-manager-standard.md b/Standards/scs-0116-v1-key-manager-standard.md index 55d74f0d0..b0dd19139 100644 --- a/Standards/scs-0116-v1-key-manager-standard.md +++ b/Standards/scs-0116-v1-key-manager-standard.md @@ -1,7 +1,8 @@ --- -title: Key Manager Standard +title: SCS Key Manager Standard type: Standard -status: Draft +status: Stable +stabilized_at: 2024-11-13 track: IaaS --- diff --git a/Standards/scs-0116-w1-key-manager-implementation-testing.md b/Standards/scs-0116-w1-key-manager-implementation-testing.md index 0ca20bf2e..d3acc6b4c 100644 --- a/Standards/scs-0116-w1-key-manager-implementation-testing.md +++ b/Standards/scs-0116-w1-key-manager-implementation-testing.md @@ -2,7 +2,7 @@ title: "SCS Key Manager Standard: Implementation and Testing Notes" type: Supplement track: IaaS -status: Proposal +status: Draft supplements: - scs-0116-v1-key-manager-standard.md --- @@ -44,6 +44,11 @@ This can be done with a small change in the policy.yaml file. The `creator` has The check for the presence of a Key Manager is done with a test script, that checks the presence of a Key Manager service in the catalog endpoint of Openstack. This check can eventually be moved to the checks for the mandatory an supported service/API list, in case of a promotion of the Key Manager to the mandatory list. +### Implementation + +The script [`check-for-key-manager.py`](https://github.com/SovereignCloudStack/standards/blob/main/Tests/iaas/key-manager/check-for-key-manager.py) +connects to OpenStack and performs the checks described in this section. + ## Manual Tests It is not possible to check a deployment for a correctly protected Master KEK automatically from the outside. 
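As an illustration of the catalog check described in the Implementation section above, a stripped-down sketch could look as follows; it assumes `openstacksdk` and a configured `OS_CLOUD` and is not the authoritative `check-for-key-manager.py` script:

```python
# Simplified sketch; the authoritative implementation is
# Tests/iaas/key-manager/check-for-key-manager.py in the standards repository.
import os

import openstack
from keystoneauth1 import exceptions as ks_exc


def has_key_manager(cloud_name):
    """Return True if the service catalog advertises a key-manager (Barbican) endpoint."""
    conn = openstack.connect(cloud=cloud_name)
    try:
        return conn.session.get_endpoint(service_type="key-manager") is not None
    except ks_exc.EndpointNotFound:
        return False


if __name__ == "__main__":
    cloud = os.environ.get("OS_CLOUD", "envvars")
    print("key-manager service found" if has_key_manager(cloud) else "no key-manager service in catalog")
```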
diff --git a/Standards/scs-0117-v1-volume-backup-service.md b/Standards/scs-0117-v1-volume-backup-service.md
index d272dfa05..9838536fa 100644
--- a/Standards/scs-0117-v1-volume-backup-service.md
+++ b/Standards/scs-0117-v1-volume-backup-service.md
@@ -1,7 +1,8 @@
---
title: Volume Backup Functionality
type: Standard
-status: Draft
+status: Stable
+stabilized_at: 2024-11-13
track: IaaS
---
diff --git a/Standards/scs-0118-v1-taxonomy-of-failsafe-levels.md b/Standards/scs-0118-v1-taxonomy-of-failsafe-levels.md
new file mode 100644
index 000000000..45f494368
--- /dev/null
+++ b/Standards/scs-0118-v1-taxonomy-of-failsafe-levels.md
@@ -0,0 +1,255 @@
+---
+title: SCS Taxonomy of Failsafe Levels
+type: Decision Record
+status: Draft
+track: IaaS
+---
+
+
+## Abstract
+
+When talking about redundancy and backups in the context of cloud infrastructures, the circumstances under which these concepts apply to various resources are neither homogeneous nor intuitive.
+Very detailed lists of risks and of the consequences of each risk do exist, but this Decision Record should give a high-level view on the topic,
+so that in each standard that references redundancy, it can easily be seen how far this redundancy goes in a certain circumstance.
+Readers of such standards should be able to know at one glance whether the achieved failure safety is on a basic level or a higher one and whether there would be additional actions needed to protect the data.
+
+This is why this decision record aims to define different levels of failure safety.
+These levels can then be used in standards to clearly set the scope that certain procedures in e.g. OpenStack offer.
+
+## Glossary
+
+| Term                | Explanation                                                                                                                    |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| Availability Zone   | (also: AZ) internal representation of physical grouping of service hosts, which also leads to internal grouping of resources. |
+| BSI                 | German Federal Office for Information Security (Bundesamt für Sicherheit in der Informationstechnik).                         |
+| CSP                 | Cloud Service Provider, provider managing the OpenStack infrastructure.                                                       |
+| Compute             | A generic name for the IaaS service that manages virtual machines (e.g. Nova in OpenStack).                                   |
+| Network             | A generic name for the IaaS service that manages network resources (e.g. Neutron in OpenStack).                               |
+| Storage             | A generic name for the IaaS service that manages the storage backends and virtual devices (e.g. Cinder in OpenStack).         |
+| RTO                 | Recovery Time Objective, the acceptable time needed to restore a resource.                                                    |
+| Disk                | A physical disk drive (e.g. HDD, SSD) in the infrastructure.                                                                  |
+| Host                | A physical machine in the infrastructure providing computational, storage and/or network connectivity capabilities.           |
+| Cyber attack/threat | Attacks on the infrastructure through the means of electronic access.                                                         |
+
+## Context
+
+Some standards provided by the SCS project will talk about or require procedures to back up resources or have redundancy for resources.
+This decision record should discuss which failure threats exist within an IaaS and KaaS deployment and will classify them into several levels according to their impact and possible handling mechanisms.
+In consequence, these levels should be used in standards concerning redundancy or failure safety.
+
+Based on our research, no similar standardized classification scheme seems to exist currently.
+Something close but also very detailed is the [BSI-Standard 200-3 (german)][bsi-200-3] published by the German Federal Office for Information Security.
+As we want to focus on IaaS and K8s resources and also have an easily understandable structure that can be applied in standards covering replication, redundancy and backups, that document is too detailed for our purposes.
+
+### Goal of this Decision Record
+
+The SCS wants to classify levels of failure cases according to their impact and the respective measures CSPs can implement to prepare for each level.
+Standards that deal with redundancy or backups or recovery SHOULD refer to the levels of this standard.
+Thus every reader knows up to which level of failsafeness the implementation of the standard works.
+Readers should then be able to derive what other measures they have to apply to reach the failsafe level they want to reach.
+
+:::caution
+
+This document will not be a replacement for a risk analysis.
+Every CSP and every customer (user of IaaS or KaaS resources) needs to do a risk analysis of their own.
+Also, the differentiation of failure cases into classes may not be an ideal basis for Business Continuity Planning.
+It may be used to get general hints and directions though.
+
+:::
+
+### Differentiation between failsafe levels and high availability, disaster recovery, redundancy and backups
+
+The levels of failsafeness defined in this decision record classify the possibilities and impacts of failure cases (such as data loss) and the possible measures.
+High Availability, disaster recovery, redundancy and backups are all measures that can and should be applied to IaaS and KaaS deployments by both CSPs and users to reduce the possibility and impact of data loss.
+So with this document every reader can see up to what level of failsafeness their measures protect user data.
+
+To also differentiate between the named measures, the following table can be used:
+
+| Term               | Explanation                                                                                                                                                                          |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| High Availability  | Refers to the availability of resources over an extended period of time unaffected by smaller hardware issues. E.g. achievable through having several instances of resources.       |
+| Disaster Recovery  | Measures taken after an incident to recover data, IaaS resources and maybe even physical resources.                                                                                  |
+| Redundancy         | Having more than one (or two) instances of each resource, to be able to switch to the second resource (could also be a data mirror) in case of a failure.                           |
+| Backup             | A specific copy of user data that presents all data points at a given time. Usually managed by users themselves, read only and never stored in the same place as the original data. |
+
+### Failsafe Levels and RTO
+
+As this document classifies failure cases with very broad impacts and is written mostly with regard to IaaS and KaaS, there cannot be one single RTO set.
+The RTOs will differ for each resource and also between the IaaS and KaaS levels.
+It should be taken into consideration that achieving the RTOs for IaaS and KaaS means making user data available again through measures within the infrastructure.
+But this will not be effective when there is no backup of the user data or no redundancy of it already in place.
+So the different failsafe levels, measures and impacts will be needed to define realistic RTOs.
+For example, a failing storage disk will not result in a volume being unavailable and needing a defined RTO, when the storage backend uses internal replication and still has two replicas of the user data.
+In the worst case of a natural disaster, most likely a severe fire, the whole deployment will be lost, and if users made no off-site backups, any defined RTO can never be met, because the data cannot be recovered anymore.
+
+[bsi-200-3]: https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/BSI_Standards/standard_200_3.pdf?__blob=publicationFile&v=2
+
+## Decision
+
+### Failsafe Levels
+
+This Decision Record defines **four** failsafe levels, each of which describes what kind of failures have to
+be tolerated by a provided service.
+
+:::caution
+
+This table only contains examples of failure cases.
+This should not be used as a replacement for a risk analysis.
+
+:::
+
+In general, the lowest, **level 1**, describes isolated/local failures which can occur very frequently, whereas
+the highest, **level 4**, describes relatively unlikely failures that impact a whole or even multiple datacenter(s):
+
+| Level | Probability | Impact | Examples |
+| - | - | - | - |
+| 1 | Very High | small Hardware Issue | Disk failure, RAM failure, small software bug |
+| 2 | High | Rack-wide | Rack outage, power outage, small fire |
+| 3 | Medium | site-wide (temporary) | Regional power outage, huge fire, orchestrated cyber attack |
+| 4 | Low | site destruction | Natural disaster |
+
+For example, a provided service with failsafe level 2 tolerates a rack outage (because there is some kind of
+redundancy in place).
+
+There are some *general* consequences that can be addressed by CSPs and users in the following ways:
+
+| Level | consequences for CSPs | consequences for Users |
+|---|-----|-----|
+| 1. Level | CSPs MUST operate replicas for important components (e.g. replicated volume back-end, replicated database, ...). | Users SHOULD back up their data themselves and place it on another host. |
+| 2. Level | CSPs MUST have redundancy for important components (e.g. HA for API services, redundant power supply, ...). | Users MUST back up their data themselves and place it on another host. |
+| 3. Level | CSPs SHOULD operate hardware in dedicated Availability Zones. | Users SHOULD back up their data in different AZs or even other deployments. |
+| 4. Level | CSPs may not be able to save user data from such catastrophes. | Users MUST have a backup of their data in a different geographic location. |
+
+:::caution
+
+The columns **consequences for CSPs / Users** only show examples of actions that may provide this class of failure safety for a certain resource.
+Customers should always check what they can do to protect their data and not rely solely on the CSP.
+
+:::
+
+More specific guidance on what these levels mean on the IaaS and KaaS layers will be provided in the sections
+further down.
+But beforehand, we will describe the considered failure scenarios and the resources that may be affected.
+
+### Failure Scenarios
+
+The following failure scenarios have been considered for the proposed failsafe levels.
+For each failure scenario, we estimate the probability of occurrence and the (worst case) damage caused by the scenario.
+Furthermore, the corresponding minimum failsafe level covering that failure scenario is given.
+The following table gives a coarse view of the probabilities that are used to describe the occurrence of failure cases:
+
+| Probability | Meaning |
+|-----------|----|
+| Very Low | Occurs at most once a decade OR needs extremely unlikely circumstances. |
+| Low | Occurs at most once a year OR needs very unlikely circumstances. |
+| Medium | Occurs more than once a year, up to once a month. |
+| High | Occurs more than once a month, up to a daily basis. |
+| Very High | Occurs within minutes. |
+
+#### Hardware Related
+
+| Failure Scenario | Probability | Consequences | Failsafe Level Coverage |
+|----|-----|----|----|
+| Disk Failure | High | Permanent data loss in this disk. Impact depends on type of lost data (data base, user data) | L1 |
+| Host Failure (without disks) | Medium to High | Permanent loss of functionality and connectivity of host (impact depends on type of host) | L1 |
+| Host Failure | Medium to High | Data loss in RAM and temporary loss of functionality and connectivity of host (impact depends on type of host) | L1 |
+| Rack Outage | Medium | Outage of all nodes in rack | L2 |
+| Network router/switch outage | Medium | Temporary loss of service, loss of connectivity, network partitioning | L2 |
+| Loss of network uplink | Medium | Temporary loss of service, loss of connectivity | L3 |
+| Power Outage (Data Center supply) | Medium | Temporary outage of all nodes in all racks | L3 |
+
+#### Environmental
+
+Note that the probability of these scenarios depends on the location.
+
+| Failure Scenario | Probability | Consequences | Failsafe Level Coverage |
+|----|-----|----|----|
+| Fire | Low | permanent Disk and Host loss in the affected zone | L3 |
+| Flood | Very Low | permanent Disk and Host loss in the affected region | L4 |
+| Earthquake | Very Low | permanent Disk and Host loss in the affected region | L4 |
+| Storm/Tornado | Low | permanent Disk and Host loss in the affected region | L4 |
+
+As we consider mainly deployments in central Europe, the probability of earthquakes is low, and in the rare case of such an event the severity is also low compared to other regions of the world (e.g. the Pacific Ring of Fire).
+The event of a flood will most likely come from overflowing rivers instead of storm floods from a sea.
+Measures can be taken to reduce the probability and severity of a flooding event in central Europe by simply choosing a different location for a deployment.
+
+#### Software Related
+
+| Failure Scenario | Probability | Consequences | Failsafe Level Coverage |
+|----|-----|----|----|
+| Software bug (major) | Low to Medium | permanent loss or compromise of data that triggers the bug, up to data on the whole deployment | L3 |
+| Software bug (minor) | Medium to High | temporary or partial loss or compromise of data | L1 |
+
+Many software components have lots of lines of code and cannot be proven correct in their whole functionality.
+They are instead tested with, at best, enough test cases to check every interaction.
+Still, bugs can and will occur in software.
+Most of them are rather small issues that might even seem like a feature to some.
+An example for this would be [whether a floating IP in OpenStack could be assigned to a VM even if it is already bound to another VM](https://bugs.launchpad.net/neutron/+bug/2060808).
+Bugs like this do not affect a whole deployment when they are triggered, but just specific data or resources.
+Nevertheless, those bugs can be a daily struggle.
+This is the reason the probability of such minor bugs may be pretty high, but the consequences would either be just temporary or would only result in small losses or compromises.
+
+On the other hand, major bugs, which might be used to compromise data that is not directly connected to the triggered bug, occur only a few times a year.
+This can be seen e.g. in the [OpenStack Security Advisories](https://security.openstack.org/ossalist.html), where only 3 major bugs were found in 2023.
+While these bugs might appear only rarely, their consequences are immense.
+They might be the reason for a whole deployment to be compromised or shut down.
+CSPs should be in contact with people triaging and patching such bugs, to be informed early and to be able to update their deployments before the bug is openly announced.
+
+#### Human Interference
+
+| Failure Scenario | Probability | Consequences | Failsafe Level Coverage |
+|----|-----|----|----|
+| Minor operating error | High | Temporary outage | L1 |
+| Major operating error | Low | Permanent loss of data | L3 |
+| Cyber attack (minor) | Very High | permanent loss or compromise of data on affected Disk and Host | L1 |
+| Cyber attack (major) | Medium | permanent loss or compromise of data on affected Disk and Host | L3 |
+
+Mistakes in maintaining a data center will always happen.
+To reduce the probability of such a mistake, measures are needed to reduce human error, which is more an issue of sociology and psychology than of computer science.
+On the other side, an attack on an infrastructure cannot be avoided by this.
+Instead, every deployment needs to be prepared for an attack all the time, e.g. through security updates.
+The severity of cyber attacks can also vary broadly: from denial-of-service attacks, which should only be a temporary issue, up to coordinated attacks to steal or destroy data, which could also affect a whole deployment.
+The easier an attack is, the more frequently it will be used by various persons and organizations, up to the point of being just daily business.
+Major attacks are often orchestrated and require specific knowledge, e.g. of Day-0 bugs or of the attacked infrastructure.
+Due to that nature, their occurrence is less likely, but the damage done can be far more severe.
+
+## Consequences
+
+Using the definition of levels established in this decision record throughout all SCS standards would allow readers to understand up to which level certain procedures or aspects of resources (e.g. volume types or a backend requiring redundancy) would protect their data and/or resource availability.
+
+### Affected Resources
+
+#### IaaS Layer (OpenStack Resources)
+
+| Resource | Explanation | Affected by Level |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------- | ----------------- |
+| Ephemeral VM | Equals the `server` resource in Nova, booting from ephemeral storage. | L1, L2, L3, L4 |
+| Volume-based VM | Equals the `server` resource in Nova, booting from a volume. | L2, L3, L4 |
+| Ephemeral Storage | Disk storage directly supplied to a virtual machine by Nova. Different from volumes. | L1, L2, L3, L4 |
+| Ironic Machine | A physical host managed by Ironic or as a `server` resource in Nova. | L1, L2, L3, L4 |
+| (Glance) Image | IaaS resource usually storing raw disk data. Managed by the Glance service. | (L1), L2, L3, L4 |
+| (Cinder) Volume | IaaS resource representing block storage disk that can be attached as a virtual disk to virtual machines. Managed by the Cinder service. | (L1, L2), L3, L4 |
+| (Volume) Snapshot | Thinly-provisioned copy-on-write snapshots of volumes. Stored in the same Cinder storage backend as volumes. | (L1, L2), L3, L4 |
+| Volume Type | Attribute of volumes determining storage details of a volume such as backend location or whether the volume will be encrypted. | L3, L4 |
+| (Barbican) Secret | IaaS resource storing cryptographic assets such as encryption keys. Managed by the Barbican service. | L3, L4 |
+| Key Encryption Key | IaaS resource used to encrypt other keys to be able to store them encrypted in a database. | L3, L4 |
+| Floating IP | IaaS resource, an IP that is usually routed and accessible from external networks. | L3, L4 |
+
+#### KaaS Layer (Kubernetes Resources)
+
+A detailed list of consequences for certain failures can be found in the [Kubernetes docs](https://kubernetes.io/docs/tasks/debug/debug-cluster/).
+The following table gives an overview of certain resources on the KaaS Layer and in which failsafe classes they are affected:
+
+| Resource(s) | Explanation | Affected by Level |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------- | ----------------- |
+| Pod | Kubernetes object that represents a workload to be executed, consisting of one or more containers. | L3, L4 |
+| Container | A lightweight and portable executable image that contains software and all of its dependencies. | L3, L4 |
+| Deployment, StatefulSet | Kubernetes objects that manage a set of Pods. | L3, L4 |
+| Job | Application workload that runs once. | L3, L4 |
+| CronJob | Application workload that runs once, but repeatedly at specific intervals. | L3, L4 |
+| ConfigMap, Secret | Objects holding static application configuration data. | L3, L4 |
+| Service | Makes a Pod's network service accessible inside a cluster. | (L2), L3, L4 |
+| Ingress | Makes a Service externally accessible. | L2, L3, L4 |
+| PersistentVolume (PV) | Persistent storage that can be bound and mounted to a pod. | L1, L2, L3, L4 |
+
+Also see [Kubernetes Glossary](https://kubernetes.io/docs/reference/glossary/).
diff --git a/Standards/scs-0118-w1-example-impacts-of-failure-scenarios.md b/Standards/scs-0118-w1-example-impacts-of-failure-scenarios.md
new file mode 100644
index 000000000..a41ceb6ea
--- /dev/null
+++ b/Standards/scs-0118-w1-example-impacts-of-failure-scenarios.md
@@ -0,0 +1,77 @@
+---
+title: "SCS Taxonomy of Failsafe Levels: Examples of Failure Cases and their impact on IaaS and KaaS resources"
+type: Supplement
+track: IaaS
+status: Draft
+supplements:
+  - scs-0118-v1-taxonomy-of-failsafe-levels.md
+---
+
+## Examples of the impact from certain failure scenarios on Cloud Resources
+
+Failure cases in Cloud deployments can be hardware-related, environmental, due to software errors or human interference.
+The following table summarizes different failure scenarios that can occur:
+
+| Failure Scenario | Probability | Consequences | Failsafe Level Coverage |
+|----|-----|----|----|
+| Disk Failure | High | Permanent data loss in this disk. Impact depends on type of lost data (data base, user data) | L1 |
Impact depends on type of lost data (database, user data) | L1 |
+| Host Failure (without disks) | Medium to High | Permanent loss of functionality and connectivity of host (impact depends on type of host) | L1 |
+| Host Failure | Medium to High | Data loss in RAM and temporary loss of functionality and connectivity of host (impact depends on type of host) | L1 |
+| Rack Outage | Medium | Outage of all nodes in rack | L2 |
+| Network router/switch outage | Medium | Temporary loss of service, loss of connectivity, network partitioning | L2 |
+| Loss of network uplink | Medium | Temporary loss of service, loss of connectivity | L3 |
+| Power Outage (Data Center supply) | Medium | Temporary outage of all nodes in all racks | L3 |
+| Fire | Medium | Permanent Disk and Host loss in the affected zone | L3 |
+| Flood | Low | Permanent Disk and Host loss in the affected region | L4 |
+| Earthquake | Very Low | Permanent Disk and Host loss in the affected region | L4 |
+| Storm/Tornado | Low | Permanent Disk and Host loss in the affected region | L4 |
+| Software bug (major) | Low | Permanent loss or compromise of data that triggers the bug, up to data on the whole physical machine | L3 |
+| Software bug (minor) | High | Temporary or partial loss or compromise of data | L1 |
+| Minor operating error | High | Temporary outage | L1 |
+| Major operating error | Low | Permanent loss of data | L3 |
+| Cyber attack (minor) | High | Permanent loss or compromise of data on affected Disk and Host | L1 |
+| Cyber attack (major) | Medium | Permanent loss or compromise of data on affected Disk and Host | L3 |
+
+Those failure scenarios can result in either only temporary (T) or permanent (P) loss of IaaS / KaaS resources or data.
+Additionally, there are a lot of resources in IaaS alone that are affected by these failure scenarios to different degrees.
+The following tables show the impact **when no redundancy or failure safety measure is in place**, i.e., when
+**not even failsafe level 1 is fulfilled**.
+
+### Impact on IaaS Resources (IaaS Layer)
+
+| Resource | Disk Loss | Node Loss | Rack Loss | Power Loss | Natural Catastrophe | Cyber Threat | Software Bug |
+|----|----|----|----|----|----|----|----|
+| Image | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | P |
+| Volume | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | P |
+| User Data on RAM/CPU | | P | P | P | P | T/P | P |
+| volume-based VM | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | P |
+| ephemeral-based VM | P[^1] | P | P | T | P (T[^4]) | T/P | P |
+| Ironic-based VM | P[^2] | P | P | T | P (T[^4]) | T/P | P |
+| Secret | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | P |
+| network configuration (DB objects) | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | P |
+| network connectivity (materialization) | | T[^3] | T/P | T | P (T[^4]) | T/P | T |
+| floating IP | P[^1] | T[^3] | T/P | T | P (T[^4]) | T/P | T |
+
+In some cases, this only results in temporary unavailability, and cloud infrastructures usually have certain mechanisms in place to avoid data loss, like redundancy in storage backends and databases (see the sketch after the footnotes below).
+So some of these outages are easier to mitigate than others.
+
+[^1]: If the resource is located on that specific disk.
+[^2]: Everything located on that specific disk. If more than one disk is used, some data could be recovered.
+[^3]: If the resource is located on that specific node.
+[^4]: In case disks, nodes or racks are not destroyed, some data could be saved, e.g. when a fire only destroys the power line.
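+
+As an illustration of such a mitigation on the storage side, a replicated storage backend keeps several copies of each object, so that the loss of a single disk or node does not cause permanent data loss. The following sketch (assuming a Ceph backend and a pool named `volumes`, which is only an example name for a typical Cinder pool) shows how such a replication factor could be inspected and adjusted:
+
+```bash
+# Hedged example: inspect and set the replication factor of a Ceph pool.
+# "volumes" is an assumed pool name for Cinder volumes; adjust to the actual setup.
+ceph osd pool get volumes size       # current number of replicas kept per object
+ceph osd pool get volumes min_size   # minimum replicas required to keep serving I/O
+ceph osd pool set volumes size 3     # keep three copies spread over the failure domain
+```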
+
+### Impact on Kubernetes Resources (KaaS layer)
+
+:::note
+
+In case the KaaS layer runs on top of the IaaS layer, the impacts described in the table above apply to the KaaS layer as well.
+
+:::
+
+| Resource | Disk Loss | Node Loss | Rack Loss | Power Loss | Natural Catastrophe | Cyber Threat | Software Bug |
+|----|----|----|----|----|----|----|----|
+|Node|P| | | | | |T/P|
+|Kubelet|T| | | | | |T/P|
+|Pod|T| | | | | |T/P|
+|PVC|P| | | | | |P|
+|API Server|T| | | | | |T/P|
diff --git a/Standards/scs-0119-v1-rook-decision.md b/Standards/scs-0119-v1-rook-decision.md
new file mode 100644
index 000000000..47e825f70
--- /dev/null
+++ b/Standards/scs-0119-v1-rook-decision.md
@@ -0,0 +1,76 @@
+---
+title: Replacement of the deprecated ceph-ansible tool
+type: Decision Record
+status: Draft
+track: IaaS
+---
+
+## Abstract
+
+This decision record evaluates the choice for a modern, future-proof deployment tool for the networked storage solution Ceph in the SCS reference implementation, [OSISM](https://osism.tech/).
+The new deployment tool aims to enhance Kubernetes integration within SCS, potentially allowing providers to manage the Ceph cluster with greater ease and efficiency.
+
+## Context
+
+The current reference implementation relies on `ceph-ansible`, [which is now deprecated](https://github.com/ceph/ceph-ansible/commit/a9d1ec844d24fcc3ddea7c030eff4cd6c414d23d). As a result, this decision record evaluates two alternatives: [Cephadm](https://docs.ceph.com/en/latest/cephadm/) and [Rook](https://rook.io/docs/rook/latest-release/Getting-Started/intro/).
+
+Both tools are designed to roll out and configure Ceph clusters, providing the capability to manage clusters throughout their lifecycle. This includes functionalities such as adding or removing OSDs, upgrading Ceph services, and managing CRUSH maps, as outlined in the [Feature-Decision-Table](#feature-decision-table).
+
+This decision record considers both the current and future needs of the reference implementation. The decision is guided by a comprehensive comparison of each tool's capabilities and limitations as well as the SCS community's needs and future objectives.
+
+### Comparison of Features
+
+The tool selected in this decision MUST ensure:
+
+* ease of migration
+* future-proofness
+* feature-completeness and feature-maturity
+* effective management of Ceph clusters
+
+#### Feature Decision Table
+
+A comparative analysis of Cephadm and Rook highlights the following:
+
+| Feature | Supported in Cephadm | Supported in Rook |
+| ------- | -------------------- | ----------------- |
+| Migrate from other setups | ☑ Adoption of clusters that were built with ceph-ansible [is officially supported](https://docs.ceph.com/en/quincy/cephadm/adoption/).| ☐ Migration from other setups is not officially supported. See this [issue](https://github.com/rook/rook/discussions/12045). Consequently, SCS is developing a migration tool named [rookify](https://github.com/SovereignCloudStack/rookify). Alternatively, Rook allows using [Ceph as an external cluster](https://rook.io/docs/rook/latest-release/CRDs/Cluster/external-cluster/external-cluster/). |
+| Connect RGW with OpenStack Keystone | ☑ | ☑ Experimental |
+| Deploy specific Ceph versions | ☑ | ☑ |
+| Upgrade to specific Ceph versions | ☑ Streamlined upgrade process. | ☑ Rook, CSI and Ceph upgrades have to be aligned; there is a [guide](https://rook.io/docs/rook/latest-release/Upgrade/health-verification/) available for each Rook version. 
|
+| Deploy Ceph Monitors | ☑ | ☑ |
+| Deploy Ceph Managers | ☑ | ☑ |
+| Deploy Ceph OSDs | ☑ | ☑ |
+| Deploy Ceph Object Gateway (RGW) | ☑ | ☑ |
+| Removal of nodes | ☑ | ☑ |
+| Purging of complete cluster | ☑ | ☑ |
+
+☐ not supported (yet)
+☑ supported
+☑☑ better option
+☒ not supported on purpose
+
+#### Evaluation in the Light of SCS Community Plans and Preferences
+
+**Environment**: Cephadm is better suited for traditional or standalone environments. Conversely, Rook is tailored for Kubernetes. That being said, it's important to note that the current state of resource deployment and management on Kubernetes within the IaaS reference implementation is still in its early stages. This would make Rook one of the first components to utilise Kubernetes in OSISM.
+
+**Deployment**: Cephadm uses containerization for Ceph components, whereas Rook fully embraces the Kubernetes ecosystem for deployment and management. Although containerization is already a core concept in the reference implementation, there is a strong push from the SCS community to adopt more Kubernetes.
+
+**Configuration and Management**: Rook offers a more straightforward experience for those already utilizing Kubernetes, leveraging Kubernetes' features for automation and scaling. In contrast, Cephadm grants finer control over Ceph components, albeit necessitating more manual intervention. In both cases, this is something that needs to be partly abstracted by the reference implementation.
+
+**Integration**: Rook provides better integration with cloud-native tools and environments, whereas Cephadm offers a more Ceph-centric management experience.
+
+**Migration**: Rook does not currently provide any migration support, while Cephadm does offer this capability. However, the SCS community is highly supportive of developing a migration tool (Rookify) for Rook, as this would enhance SCS's influence by offering the first migration solution specifically for Rook providers.
+
+**SCS Community**: An important factor in our decision is the preferences and direction of the SCS community and its providers. There is a noticeable trend towards increased use of Kubernetes within the community. This indicates a preference for deployment tools that integrate well with Kubernetes environments.
+
+**SCS Future Goals**: The SCS community is open to building tools that provide open-source, publicly available solutions beyond the scope of SCS. This openness to development efforts that address limitations of the chosen tools, such as Rook, is also a key consideration in our decision.
+
+## Decision
+
+As OSISM will increasingly focus on a Kubernetes-centric approach for orchestration in the near future, adopting Rook is a more suitable and standardized approach. Moreover, many service providers within the SCS community (including several who deploy OSISM) already have experience with Kubernetes. Regarding the missing OpenStack Keystone integration, we are confident that the colleagues who work on this issue will provide a solution in a timely manner. We expect that deploying Ceph with Rook will simplify deployment and configuration from the outset.
+In order to allow for a migration from existing Ceph installations to Rook, we decided to develop a migration tool (called Rookify) for the reference implementation. If the development of Rookify goes beyond the targeted scope of the reference implementation, the tool will add value to the Ceph as well as the Rook community. 
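+
+To illustrate what the Kubernetes-centric approach looks like in practice, the following sketch shows a minimal `CephCluster` custom resource as it could be applied once the Rook operator is installed; the Ceph image version and storage settings are illustrative assumptions, not values prescribed by this decision:
+
+```bash
+# Hedged sketch: apply a minimal Rook CephCluster definition (all values are examples).
+kubectl apply -f - <<'EOF'
+apiVersion: ceph.rook.io/v1
+kind: CephCluster
+metadata:
+  name: rook-ceph
+  namespace: rook-ceph
+spec:
+  cephVersion:
+    image: quay.io/ceph/ceph:v18    # example Ceph release
+  dataDirHostPath: /var/lib/rook
+  mon:
+    count: 3                        # three monitors for quorum
+  storage:
+    useAllNodes: true
+    useAllDevices: true             # let Rook consume all empty disks on the nodes
+EOF
+```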
+
+## Consequences
+
+Migrating an existing Ceph environment onto Kubernetes, as well as bringing together existing but independent Ceph and Kubernetes environments, will become straightforward without much manual intervention needed.
+Landscapes that currently do not deploy a Kubernetes cluster have to adapt and provide a Kubernetes cluster in the future.
diff --git a/Standards/scs-0120-v1-capi-images.md b/Standards/scs-0120-v1-capi-images.md
new file mode 100644
index 000000000..1fe2380b5
--- /dev/null
+++ b/Standards/scs-0120-v1-capi-images.md
@@ -0,0 +1,63 @@
+---
+title: Cluster-API images
+type: Decision Record
+status: Draft
+track: IaaS
+---
+
+## Abstract
+
+The SCS reference implementation for the Kubernetes-as-a-service layer is built on top of Cluster API (CAPI), and therefore it depends on the corresponding VM images, which may or may not be present on the underlying infrastructure-as-a-service layer. Current tooling will make sure to upload the required image in case it's not present or outdated. However, these ad-hoc uploads will not be shared across environments, which may lead to waste of bandwidth (for transferring the image), storage (if images are not stored in a deduplicated manner), and not least time (because the upload does take multiple minutes). Needless to say, it may also lead to excessive greenhouse-gas emissions.
+
+This decision record investigates the pros and cons of making the CAPI images mandatory. Ultimately, the decision is made to keep them recommended; we stress, however, that providers who offer the images by default should advertise this fact.
+
+## Terminology
+
+- _Kubernetes as a service (KaaS)_: A service that offers provisioning Kubernetes clusters.
+- _Cluster API (CAPI)_: "Cluster API is a Kubernetes sub-project focused on providing declarative APIs and tooling to simplify provisioning, upgrading, and operating multiple Kubernetes clusters." ([source](https://cluster-api.sigs.k8s.io/)) This API can thus be used to implement KaaS.
+- _CAPI image_: Virtual machine image that contains a standardized Kubernetes setup to be used for CAPI. The SCS reference implementation for KaaS depends on these images.
+- _CSP_: Cloud-service provider
+
+## Design considerations
+
+We consider the following two options:
+
+1. Make CAPI image mandatory.
+2. Keep CAPI image recommended.
+
+For reasons of symmetry, it suffices to consider the pros and cons of the first option.
+
+Pros:
+
+- Save time, money, physical resources and power for both CSP and customer.
+- Regardless of CSP taste, this KaaS tech is part of SCS.
+
+Neutral:
+
+- The CAPI image can be provided in an automated fashion, which means virtually no burden to the CSP.
+- The KaaS implementation will work either way.
+- Willing CSPs may offer the image by default and advertise as much.
+
+Cons:
+
+- Additional regulations would be necessary to guarantee quality and timeliness of the image.
+- Some CSPs may be opposed to being forced to offer a certain service, which may hurt the overall acceptance
+  of the SCS standardization efforts.
+
+## Decision
+
+Ultimately, we value the freedom of the CSPs (and the acceptance of the standardization efforts) highest;
+willing CSPs are welcome to opt in, i.e., to provide up-to-date images and advertise as much.
+
+Therefore we decide to _keep the CAPI images recommended_.
+
+## Consequences
+
+None, as the status quo is being kept. 
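+
+In practice, a willing CSP that provides the image (and a user probing for it) can verify its presence via the Image API; the image name below is only an assumed example, since the exact naming is not standardized here:
+
+```bash
+# Hedged example: look for a CAPI image by an assumed name and inspect its metadata.
+openstack image list --name "ubuntu-capi-image-v1.30.4"
+openstack image show "ubuntu-capi-image-v1.30.4" -f value -c properties
+```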
+
+## Open questions
+
+Some interesting potential future work does remain, however: to find a way to certify that a willing provider
+does indeed provide up-to-date images. It would be possible with today's methods to certify that a CAPI
+image is present (the image_spec yaml file would have to be split up to obtain a separate test case), but
+there is no way to make sure that the image is up to date.
diff --git a/Standards/scs-0121-v1-Availability-Zones-Standard.md b/Standards/scs-0121-v1-Availability-Zones-Standard.md
new file mode 100644
index 000000000..0dc9ed698
--- /dev/null
+++ b/Standards/scs-0121-v1-Availability-Zones-Standard.md
@@ -0,0 +1,206 @@
+---
+title: SCS Availability Zones
+type: Standard
+status: Stable
+stabilized_at: 2024-11-13
+track: IaaS
+---
+
+## Introduction
+
+On the IaaS level, especially in OpenStack, it is possible to group resources into Availability Zones.
+Such zones are often mapped to the physical layer of a deployment, e.g. to the physical separation of hardware, redundant power circuits or fire zones.
+But how CSPs apply Availability Zones to the IaaS layer in a deployment may differ widely.
+Therefore, this standard addresses the minimal requirements that need to be met when creating Availability Zones.
+
+## Terminology
+
+| Term | Explanation |
+| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| Availability Zone | (also: AZ) internal representation of a physical grouping of service hosts, which also leads to an internal grouping of resources. |
+| Fire Zone | A physical separation in a data center that will contain fire within it, effectively stopping the spread of fire. |
+| PDU | Power Distribution Unit, used to distribute the power to all physical machines of a single server rack. |
+| Compute | A generic name for the IaaS service that manages virtual machines (e.g. Nova in OpenStack). |
+| Network | A generic name for the IaaS service that manages network resources (e.g. Neutron in OpenStack). |
+| Storage | A generic name for the IaaS service that manages the storage backends and virtual devices (e.g. Cinder in OpenStack). |
+| BSI | German Federal Office for Information Security (Bundesamt für Sicherheit in der Informationstechnik) |
+| CSP | Cloud Service Provider, provider managing the OpenStack infrastructure. |
+| SDN | Software Defined Network, virtual networks managed by the networking service. |
+
+## Motivation
+
+Redundancy is a non-trivial but relevant issue for a cloud deployment.
+First and foremost, it is necessary to increase failure safety through redundancy on the physical layer.
+The IaaS layer, as the first abstraction layer from the hardware, has an important role in this topic, too.
+The grouping of redundant physical resources into Availability Zones on the IaaS level gives customers the option to distribute their workload across different AZs, which results in better failure safety.
+While CSPs already have some similarities in their grouping of physical resources into AZs, there are also differences.
+This standard aims to reduce those differences and will clarify what customers can expect from Availability Zones in IaaS.
+
+Availability Zones in IaaS can be set up for Compute, Network and Storage separately, while all may refer to the same physical separation in a deployment.
+This standard elaborates on the necessity of having Availability Zones for each of these classes of resources. 
+It will also examine the requirements customers may have when thinking about Availability Zones in relation to the taxonomy of failure safety levels [^1].
+The result should enable CSPs to know when to create AZs to be SCS-compliant.
+
+## Design Considerations
+
+Availability Zones should represent parts of the same physical deployment that are independent of each other.
+The maximum level of physical independence is achieved by putting physical machines into different fire zones.
+In that case, a failure of up to level 3 as described in the taxonomy of failure safety levels document[^1] will not lead to a complete outage of the deployment.
+
+Having Availability Zones represent fire zones also results in AZs being able to take over workload from another AZ in a failure case of level 3,
+so that even the destruction of one Availability Zone will not automatically include the destruction of the other AZs.
+
+:::caution
+
+Even with fire zones being physically designed to protect parts of a data center from severe destruction in case of a fire, this will not always succeed.
+Availability Zones in clouds are most of the time within the same physical data center.
+In case of a big catastrophe like a huge fire or a flood, the whole data center could be destroyed.
+Availability Zones will not protect customers against these failure cases of level 4 of the taxonomy of failure safety[^1].
+
+:::
+
+Smaller deployments like edge deployments may not have more than one fire zone in a single location.
+To include such deployments, it should not be required to use Availability Zones.
+
+Other physical factors that should be considered are the power supplies, internet connection, cooling and core routing.
+Availability Zones have also been used by CSPs as a representation of redundant PDUs.
+That means there are deployments which have one Availability Zone per rack, as each rack has its own PDU and this was considered to be the single point of failure an AZ should represent.
+While this is also a possible measure of independence, it only provides failure safety for level 2.
+Therefore, this standard should be very clear about which independence an AZ should represent, and it should not be allowed that different deployments have Availability Zones representing different levels of failure safety.
+
+Additionally, Availability Zones are available for the Compute, Storage and Network services.
+They behave differently for each of these resources and also when working across resource-based Availability Zones, e.g. attaching a volume from one AZ to a virtual machine in another AZ.
+For each of these IaaS resource classes, it should be defined under which circumstances Availability Zones should be used.
+
+[^1]: [Taxonomy of Failsafe Levels in SCS (TODO: change link as soon as taxonomy is merged)](https://github.com/SovereignCloudStack/standards/pull/579)
+
+### Scope of the Availability Zone Standard
+
+When elaborating redundancy and failure safety in data centers, it is necessary to also define redundancy on the physical level.
+There are already recommendations from the BSI for physical redundancy within a cloud deployment [^2].
+This standard considers these recommendations as a basis that is followed by most CSPs.
+So this standard will not go into details already provided by the CSP, but will rather concentrate on the IaaS layer and only take a coarse view of the physical layer. 
+The first assumption from the recommendations of the BSI is that the destruction of one fire zone will not lead to an outage of all power lines (not PDUs), internet connections, core routers or cooling systems.
+
+For the setup of Availability Zones this means that within every AZ there needs to be redundancy in core routers, internet connection and power lines, as well as at least two separate cooling systems.
+This should avoid having single points of failure within the Availability Zones.
+But all this physical infrastructure can be the same across all Availability Zones in a deployment, as long as it is possible to survive the destruction of one fire zone.
+
+[^2]: [Availability recommendations from the BSI](https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/RZ-Sicherheit/RZ-Verfuegbarkeitsmassnahmen.pdf?__blob=publicationFile&v=9)
+
+### Options considered
+
+#### Physical-based Availability Zones
+
+It would be possible to standardize the usage of Availability Zones across all IaaS resources.
+The downside of this is that the IaaS resources behave so differently that they have different requirements for redundancy and thus for Availability Zones.
+This is not the way to go.
+Besides that, it is already possible to create two physically separated deployments close to each other, connect them with each other and use regions to distinguish between the IaaS of both deployments.
+
+The question that remains is what an Availability Zone should consist of.
+Having one Availability Zone per fire zone gives the best level of failure safety that can be achieved by CSPs.
+When building on the relation between fire zones and the physical redundancy recommendations of the BSI, this combination is a good starting point, but it needs to be checked for validity for the different IaaS resources.
+
+Another point is where Availability Zones can be instantiated and what the connection between AZs should look like.
+To have a proper way to deal with outages of one AZ, where a second AZ can step in, a few requirements need to be met for the connection between those two AZs.
+The amount of data that needs to be transferred very fast in a failure case may be enormous, so there is a requirement for high bandwidth between connected AZs.
+To avoid additional failure cases, the latency between those two Availability Zones needs to be low.
+With such requirements it is very clear that AZs should only reside within one (physical) region of an IaaS deployment.
+
+#### AZs in Compute
+
+Compute hosts are physical machines on which the compute service runs.
+A single virtual machine is always running on ONE compute host.
+Redundancy of virtual machines is either up to the layer above IaaS or up to the customers themselves.
+Having Availability Zones gives customers the possibility to run another virtual machine as a backup in another Availability Zone.
+
+Customers will expect that, in case of the failure of one Availability Zone, all other AZs are still available.
+The highest possible failure safety here is achieved when Availability Zones for Compute are used for different fire zones.
+
+When the BSI recommendations are followed, there should already be redundancy in power lines, internet connection and cooling.
+An outage of one of these physical resources will not affect the compute host and its resources for more than a minimal timeframe.
+But when a single PDU is used for a rack, a failure of that PDU will result in an outage of all compute hosts in this rack. 
+In such a case it is not relevant whether this rack represents a whole Availability Zone or is only part of a bigger AZ.
+All virtual machines on the affected compute hosts will be unavailable and need to be restarted on other hosts, whether in the same Availability Zone or another.
+
+#### AZs in Storage
+
+There are many different backends used for the storage service, with Ceph being one of the most prominent.
+Configuring those backends can already include spanning one storage cluster over physical machines in different fire zones.
+In combination with internal replication, a configuration is possible that already distributes replicas of volumes across different fire zones.
+When a deployment has a storage backend configured like this, it can already provide safety in case of a failure of level 3.
+
+Using Availability Zones is also possible for the storage service, but configuring AZs on top of a configuration like the above will not increase safety.
+Nevertheless, using AZs when there are different backends in different fire zones will give customers a hint to back up volumes into the storage of other AZs.
+
+Additionally, when the BSI recommendations are followed, there should already be redundancy in power lines, internet connection and cooling.
+An outage of one of these physical resources will not affect the storage host and its resources for more than a minimal timeframe.
+When internal replication is used, either through the IaaS or through the storage backend itself, the outage of a single PDU and thus a single rack will not affect the availability of the data itself.
+None of these physical factors require the usage of an Availability Zone for Storage.
+An increase in the level of failure safety will not be reached through AZs in these cases.
+
+Still, it might be confusing to have deployments with compute AZs but without storage AZs.
+CSPs may need to communicate clearly up to which failure safety level their storage service automatically provides redundancy and from which level customers are responsible for the redundancy of their data.
+
+#### AZs in Network
+
+Virtualized network resources can typically be quickly and easily set up from building instructions.
+Those instructions are stored in the database of the networking service.
+
+If a physical machine on which certain network resources are set up is no longer available, the resources can be rolled out on another physical machine without being dependent on the current state of the lost resources.
+There might only be a loss of a few packets within the affected network resources.
+
+With Compute and Storage in a good state (e.g. fire zones with one compute AZ each and storage replicated across the fire zones), there would be no downsides to omitting Availability Zones for the network service.
+It might even be the opposite: Having resources running in certain Availability Zones might prevent them from being scheduled in other AZs[^3].
+As network resources like routers are bound to an AZ, in a failure case of one AZ all resource definitions might still be present in the database, while the implementation of those resources is gone.
+Trying to rebuild them in another AZ is not possible, because the scheduler will not allow them to be implemented in an AZ other than the one that is present in their definition.
+In a failure case of one AZ, this might lead to a lot of manual work to rebuild the SDN from scratch instead of just re-using the definitions. 
+
+Because of this severe side effect, this standard will make no recommendations about Network AZs.
+
+[^3]: [Availability Zones in Neutron for OVN](https://docs.openstack.org/neutron/latest/admin/ovn/availability_zones.html)
+
+### Cross-Attaching volumes from one AZ to another compute AZ
+
+Without networking AZs, we only need to take a closer look at attaching volumes to virtual machines across AZs.
+
+When there is more than one Storage Availability Zone, those AZs normally align with the Compute Availability Zones.
+This means that fire zone 1 contains compute AZ 1 and storage AZ 1, fire zone 2 contains compute AZ 2 and storage AZ 2, and the same for fire zone 3.
+It is possible to allow or forbid cross-attaching volumes from one storage Availability Zone to virtual machines in another AZ.
+If it is not allowed, the creation of volume-based virtual machines will fail if there is no space left for VMs in the corresponding Availability Zone.
+While this may be unfortunate, it gives customers a very clear picture of an Availability Zone.
+It clarifies that having a virtual machine in another AZ also requires having a backup or replication of volumes in the other storage AZ.
+This backup or replication can then be used to create a new virtual machine in the other AZ.
+
+It seems to be a good decision not to encourage CSPs to allow cross-attach.
+Currently, CSPs also do not seem to use it widely.
+
+## Standard
+
+If Compute Availability Zones are used, they MUST be in different fire zones.
+Availability Zones for Storage SHOULD be set up if there is no storage backend used that can span different fire zones and automatically replicate the data.
+Otherwise, a single Availability Zone for Storage SHOULD be configured.
+
+If more than one Availability Zone for Storage is set up, the attaching of volumes from one Storage Availability Zone to another Compute Availability Zone (cross-attach) SHOULD NOT be possible.
+
+Within each Availability Zone:
+
+- there MUST be redundancy in the power supply (i.e., the line into the deployment)
+- there MUST be redundancy in the external connection (e.g. internet connection or WAN connection)
+- there MUST be redundancy in core routers
+- there SHOULD be redundancy in the cooling system
+
+AZs SHOULD only occur within the same region and have a low-latency interconnection with high bandwidth.
+
+## Related Documents
+
+The taxonomy of failsafe levels can be used to get an overview of the levels of failure safety in a deployment (TODO: link after DR is merged).
+
+The BSI can be consulted for further information about [failure risks](https://www.bsi.bund.de/DE/Themen/Unternehmen-und-Organisationen/Standards-und-Zertifizierung/IT-Grundschutz/IT-Grundschutz-Kompendium/Elementare-Gefaehrdungen/elementare-gefaehrdungen_node.html), [risk analysis for a datacenter](https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/Grundschutz/BSI_Standards/standard_200_3.pdf?__blob=publicationFile&v=2) or [measures for availability](https://www.bsi.bund.de/SharedDocs/Downloads/DE/BSI/RZ-Sicherheit/RZ-Verfuegbarkeitsmassnahmen.pdf?__blob=publicationFile&v=9).
+
+## Conformance Tests
+
+As this standard does not require Availability Zones to be present, we cannot test conformance automatically.
+The other parts of the standard are physical or internal and could only be tested through an audit.
+Whether fire zones are physically available is a criterion that will never change for a single deployment - this only needs to be audited once. 
+It might be possible to also use Gaia-X Credentials to provide such information, which could then be tested.
diff --git a/Standards/scs-0121-w1-Availability-Zones-Standard.md b/Standards/scs-0121-w1-Availability-Zones-Standard.md
new file mode 100644
index 000000000..9ec3dbc82
--- /dev/null
+++ b/Standards/scs-0121-w1-Availability-Zones-Standard.md
@@ -0,0 +1,42 @@
+---
+title: "SCS Availability Zones: Implementation and Testing Notes"
+type: Supplement
+track: IaaS
+status: Draft
+supplements:
+  - scs-0121-v1-Availability-Zones-Standard.md
+---
+
+## Automated Tests
+
+The standard will not preclude small deployments and edge deployments, both of which will not meet the requirement of being divided into multiple Availability Zones.
+Thus, multiple Availability Zones are not always present.
+Sometimes there can just be a single Availability Zone.
+Because of that, there will be no automated tests to search for AZs.
+
+## Required Documentation
+
+The requirements for each Availability Zone are written in the Standard.
+For each deployment that uses more than a single Availability Zone, the CSP has to provide documentation to prove the following points:
+
+1. The presence of fire zones MUST be documented (e.g. through construction plans of the deployment).
+2. The correct configuration of one AZ per fire zone MUST be documented.
+3. The redundancy in power supply within each AZ MUST be documented.
+4. The redundancy in external connection within each AZ MUST be documented.
+5. The redundancy in core routers within each AZ MUST be documented.
+
+All of these requirements will either not change at all (like the fire zones) or are very unlikely to change (like the redundant internet connection).
+Because of this, documentation must only be provided in the following cases:
+
+1. When a new deployment with multiple AZs is to be tested for compliance.
+2. When there are physical changes in a deployment which already provided the documentation: the changes need to be documented and provided as soon as possible.
+
+### Alternative Documentation
+
+If a deployment has already undergone certification such as ISO 27001 or ISO 9001, those certificates can be provided as part of the documentation to cover the redundancy parts.
+It is still required to document the existence of fire zones and the correct configuration of one AZ per fire zone.
+
+## Physical Audits
+
+In cases where it is reasonable to mistrust the provided documentation, a physical audit by a natural person - called an auditor - sent by e.g. the [OSBA](https://osb-alliance.de/) should be performed.
+The CSP of the deployment which needs such an audit should grant the auditor access to the physical infrastructure and should show them all necessary IaaS-layer configurations that are needed to verify compliance with this standard.
diff --git a/Standards/scs-0122-v1-node-to-node-encryption.md b/Standards/scs-0122-v1-node-to-node-encryption.md
new file mode 100644
index 000000000..f3d298706
--- /dev/null
+++ b/Standards/scs-0122-v1-node-to-node-encryption.md
@@ -0,0 +1,529 @@
+---
+title: _End-to-End Encryption between Customer Workloads_
+type: Decision Record
+status: Draft
+track: IaaS
+---
+
+## Abstract
+
+This document explores options for developing end-to-end (E2E) encryption for
+VMs, Magnum workloads, and container layers to enhance security between user
+services. 
It includes a detailed review of various technologies, feedback from
+the OpenStack community, and the decision-making process that led to selecting
+VXLANs with the OpenStack ML2 plugin and its later abandonment in favour of a
+native openvswitch-ipsec solution.
+
+## Terminology
+
+| Term | Meaning |
+|---|---|
+| CSP | Cloud service provider; in this document it also includes an operator of a private cloud |
+| VM | Virtual machine, alternatively instance, a virtualized compute resource that functions as a self-contained server for a customer |
+| Node | Machine under CSP administration which hosts cloud services and compute instances |
+
+## Context
+
+### Motivation
+
+The security of customer/user workloads is one of a CSP's main concerns. With
+larger and more diverse cloud instances, parts of the underlying physical
+infrastructure can be outside of the CSP's direct control, either when
+interconnecting datacenters via public internet or in the case of renting
+infrastructure from a third party. Many security breaches occur due to
+actions of malicious or negligent in-house operators. While some burden lies
+with customers, who should secure their own workloads, the CSP should have the
+option to transparently protect the data pathways between instances, more so
+for private clouds, where CSP and customer are the same entity or parts of the
+same entity.
+
+In RFC8926[^rfc] it is stated:
+> A tenant system in a customer premises (private data center) may want to
+> connect to tenant systems on their tenant overlay network in a public cloud
+> data center, or a tenant may want to have its tenant systems located in
+> multiple geographically separated data centers for high availability. Geneve
+> data traffic between tenant systems across such separated networks should be
+> protected from threats when traversing public networks. Any Geneve overlay
+> data leaving the data center network beyond the operator's security domain
+> SHOULD be secured by encryption mechanisms, such as IPsec or other VPN
+> technologies, to protect the communications between the NVEs when they are
+> geographically separated over untrusted network links.
+
+We aren't considering intra-node communication, meaning communication inside
+one host node between different VMs, potentially of multiple tenants, as this
+is a question of tenant isolation, not of networking security, and encryption
+here would possibly be a redundant measure. Isolation of VMs is handled by
+OpenStack on multiple levels - overlay tunneling protocols, routing rules on
+the networking level, network namespaces on the kernel level and hypervisor
+isolation mechanisms. All the communication here exists inside the node, and
+any malicious agent with high enough access to the node itself to observe or
+tamper with the internal communication traffic would presumably have access to
+the encryption keys themselves, rendering the encryption ineffective.
+
+### Potential threats in detail
+
+We are assuming that:
+
+* the customer workloads are not executed within secure enclaves (e.g. 
Software
+Guard Extensions (SGX)) and aren't using security measures like end-to-end
+encryption themselves, either relying on the CSP for security or, in the case
+of a private cloud, being run by the operator of the cloud
+* the CSP OpenStack administrators are deemed trustworthy since they possess
+root access to the host nodes, with access to keys and certificates, enabling
+them to bypass any form of internode communication encryption
+* a third party or an independent team manages physical network communication
+between nodes within a colocation setting or the communication passes unsafe
+public infrastructure in the case of a single stretched instance spanning
+multiple data centers
+
+#### Man in the Middle Attack
+
+Considering the assumptions and the objective to enforce end-to-end (E2E)
+encryption for user workloads, our primary security concern is averting
+man-in-the-middle (MITM) attacks. These can be categorized into two distinct
+forms: active and passive.
+
+##### Passive Attacks - Eavesdropping
+
+Consider the scenario where an untrusted individual, such as a third-party
+network administrator with physical access to the data center, engages in
+'passive' covert surveillance, silently monitoring unencrypted traffic
+without interfering with data integrity or network operations.
+
+Wiretapping is a common technique employed in such espionage. It involves
+unauthorized attachment to network cabling, enabling the covert observation of
+data transit. This activity typically goes unnoticed as it does not disrupt
+the flow of network traffic, although it may occasionally introduce minor
+transmission errors.
+
+An alternative strategy involves deploying an interception device that
+captures and retransmits data, which could potentially introduce network
+latency or, if deployed disruptively, cause connectivity interruptions. Such
+devices can be concealed by synchronizing their installation with network
+downtime, maintenance periods, or less conspicuous times like power outages.
+They could also be strategically placed in less secure, more accessible
+locations, such as inter-building network links. This applies to wiretapping
+as well.
+
+Furthermore, the vulnerability extends to network devices, where an attacker
+could exploit unsecured management ports or leverage compromised remote
+management tools (like IPMI) to gain unauthorized access. Such access points,
+especially those not routinely monitored like backup VPNs, present additional
+security risks.
+
+Below is a conceptual diagram depicting potential vulnerabilities in an
+OpenStack deployment across dual regions, highlighting how these passive
+eavesdropping techniques could be facilitated.
+
+![image](https://github.com/SovereignCloudStack/issues/assets/1249759/f5b7edf3-d259-4b2a-8632-c877934f3e31)
+
+##### Active - Spoofing / Tampering
+
+Active network attacks like spoofing and tampering exploit various access
+points, often leveraging vulnerabilities uncovered during passive eavesdropping
+phases. These attacks actively manipulate or introduce unauthorized
+communications on the network.
+
+Spoofing involves an attacker masquerading as another device or user within the
+network. This deception can manifest in several ways:
+
+* **ARP Spoofing:** The attacker sends forged ARP (Address Resolution Protocol)
+  messages onto the network. This can redirect network traffic flow to the
+  attacker's machine, intercepting, modifying, or blocking data before it
+  reaches its intended destination. 
+* **DNS Spoofing:** By responding with falsified DNS replies, an attacker can
+  reroute traffic to malicious sites, enabling further compromise or data
+  exfiltration.
+* **IP Spoofing:** The attacker disguises their network identity by falsifying
+  IP address information in packets, tricking the network into accepting them
+  as legitimate traffic. This can be particularly damaging if encryption is not
+  enforced, enabling the attacker to interact with databases, invoke APIs, or
+  execute unauthorized commands while appearing as a trusted entity.
+
+Moreover, when an active interception device is in place, attackers can extend
+their capabilities to traffic filtering. They might selectively delete or alter
+logs and metrics to erase traces of their intrusion or fabricate system
+performance data, thus obscuring the true nature of their activities.
+
+### Preliminary considerations
+
+Initially, we wanted to create a plugin for Neutron[^ne] using eBPF[^eb] to
+secure the traffic automatically between VMs. We presented the idea in a
+team IaaS call[^ia]. After the initial round of feedback, specific requirements
+emerged.
+
+#### Utilize existing solutions
+
+Leverage existing technologies and frameworks as much as possible. This
+approach aims to reduce development time and ensure the solution is built on
+proven, reliable foundations. Potential technologies include:
+
+* **OVS[^sw] + IPsec[^ip]**: Provides an overlay network and has built-in
+  support for encryption using IPsec. Leveraging OVS can minimize development
+  time since it is already integrated with OpenStack.
+* **Neutron[^ne] with eBPF[^eb]**: Using eBPF[^eb] could provide fine-grained
+  control over packet filtering and encryption directly in the kernel.
+* **TripleO[^to] (with IPsec)**: The TripleO[^to] tool set for OpenStack deployment
+  supports IPsec tunnels between nodes.
+* **Neutron[^ne] + Cilium[^ci]**: Cilium is an open source, cloud native
+  eBPF[^eb]-based networking solution, including transparent encryption tools.
+* **Tailscale[^ta]** is a mesh VPN based on WireGuard[^wg] that simplifies the
+  setup of secure, encrypted networks. This could be a potential alternative
+  to managing encrypted connections in OpenStack environments.
+
+#### Upstream integration
+
+Move as much of the development work upstream into existing OpenStack projects.
+This will help ensure the solution is maintained by the wider OpenStack
+community, reducing the risk of it becoming unmaintained or unusable in the
+future. This means collaborating with the OpenStack community to contribute
+changes upstream, particularly in projects like Neutron[^ne], OVN[^ov],
+kolla[^kl] and kolla-ansible[^ka].
+
+#### Address threat modeling issues
+
+"We should not encrypt something just for the sake of encryption." The solution
+must address the specific security issues identified in the
+[threat modeling](#potential-threats-in-detail). This ideally includes
+protecting against both passive (eavesdropping) and active (spoofing,
+tampering) MITM attacks. Encryption mechanisms on all communication channels
+between VMs, containers and hosts prevent successful eavesdropping;
+authentication and integrity checks prevent spoofing and tampering. For example,
+IPsec[^ip] provides mechanisms for both encryption and integrity verification.
+
+#### Performance impact and ease of use
+
+Evaluate the performance impact of the encryption solution and ensure it is
+minimal. 
Performance benchmarking should be conducted to assess the impact of
+the encryption solution on network throughput and latency. For local trusted
+scenarios, an opt-out should be possible. The solution should also be easy to use
+and manage, both for administrators and ideally fully transparent for
+end-users. This may involve developing user-friendly interfaces and automated
+tools for key management and configuration.
+
+#### Avoid redundant encryption
+
+If possible, develop a mechanism to detect and avoid encrypting traffic that is
+already encrypted. This will help optimize performance and resource usage.
+
+By focusing on these detailed requirements and considerations, we aim to
+develop a robust, efficient, and sustainable E2E encryption solution for
+OpenStack environments. This solution will not only enhance security for user
+workloads but also ensure long-term maintainability and ease of use.
+
+### Exploration of technologies
+
+Based on the result of the threat modeling and presentation, we explored the
+following technologies and also reached out to the OpenStack mailing list for
+additional comments.
+
+This section provides a brief explanation of OpenStack networking and design
+decisions for encryption between customer workloads.
+
+#### Networking in OpenStack
+
+The foundation of networking in OpenStack is the Neutron[^ne] project,
+providing networking as a service (NaaS). It creates and manages network
+resources such as switches, routers, subnets, firewalls and load balancers,
+uses a plugin architecture to support different physical network implementation
+options and is accessible to admins or other services through an API.
+
+Another integral part is Open vSwitch (OVS)[^sw] - a widely adopted virtual
+switch implementation, which is not strictly necessary, as Neutron is quite
+flexible with the components used to implement the infrastructure, but tends to
+be the agent of choice and is the current default agent for Neutron. It allows
+Neutron to respond to environment changes, supporting accounting and monitoring
+protocols and maintaining the OVSDB state database. It manages virtual ports,
+bridges and tunnels on hosts.
+
+Open Virtual Networking (OVN[^ov]) is a logical abstraction layer on top of OVS,
+developed by the same community; it became the default controller driver for
+Neutron. It manages logical networks isolated from the underlying physical/virtual
+networks by encapsulation. It replaces the need for OVS agents running on each
+host and supports L2 switching, distributed L3 routing, access control and load
+balancing.
+
+#### Encryption options
+
+##### MACsec[^ms]
+
+A layer 2 security protocol, defined by the IEEE standard 802.1AE. It allows
+securing an Ethernet link for almost all traffic, including control protocols
+like DHCP and ARP. It is mostly implemented in hardware, in routers and
+switches, but software implementations exist, notably a Linux kernel module.
+
+##### eBPF[^eb]-based encryption with Linux Kernel Crypto API
+
+A network-packet-specific filtering technology in the Linux kernel called
+Berkeley Packet Filter (BPF) uses a specialized virtual machine inside the
+kernel to run filters on the networking stack. eBPF is an extension of this
+principle to a general purpose stack which can run sandboxed programs in the
+kernel without changing kernel code or loading modules. High-performance
+networking observability and security is a natural use-case, with projects like
+Cilium[^ci] implementing transparent in-kernel packet encryption with it. 
The Linux kernel
+itself also provides an encryption framework called the
+Linux Kernel Crypto API[^lkc], which such solutions use.
+
+##### IPsec[^ip]
+
+Internet Protocol security is a suite of protocols for network security on
+layer 3, providing authentication and packet encryption, used for example in
+Virtual Private Network (VPN) setups. It is an IETF[^ie] specification with
+various open source and commercial implementations. For historical
+reasons[^ipwh] it defines two main transmission protocols,
+Authentication Header (AH) and Encapsulating Security Payload (ESP), where only
+the latter provides encryption in addition to authentication and integrity. The
+key negotiations use the IKE(v1/v2) protocol to establish and maintain
+Security Associations (SA).
+
+##### WireGuard[^wg]
+
+Aims to be a simple and fast open source secure network tunneling solution
+working on layer 3, utilizing state-of-the-art cryptography while maintaining a
+much simpler codebase and runtime setup than alternative solutions[^wgwp]. The
+focus is on fast in-kernel encryption. WireGuard[^wg] adds new network interfaces,
+manageable by standard tooling (ifconfig, route, ...), which act as tunneling
+interfaces. The main mechanism, called _Cryptokey routing_, consists of tables
+associating public keys of endpoints with allowed IPs inside given tunnels.
+These behave as routing tables when sending and as access control lists (ACL)
+when receiving packets. All packets are sent over UDP. Built-in roaming is
+achieved by both server and clients being able to update the peer list by
+examining from where correctly authenticated data originates.
+
+### Solution proposals
+
+#### TripleO[^to] with IPsec[^ip]
+
+> TripleO is a project aimed at installing, upgrading and operating OpenStack
+> clouds using OpenStack's own cloud facilities as the foundation - building on
+> Nova, Ironic, Neutron and Heat to automate cloud management at datacenter
+> scale
+
+This project is retired as of February 2024, but its approach was considered
+for adoption.
+
+Its deployment allowed for IPsec[^ip] encryption of node communication. When
+utilized, two types of tunnels were created in the overcloud: node-to-node
+tunnels between each two nodes on the same network, for all networks those
+nodes were on, and Virtual IP tunnels. Each node hosting the Virtual IP would
+open a tunnel for any node in the specific network that could properly
+authenticate.
+
+#### OVN[^ov] + IPsec[^ip]
+
+There is support in the OVN[^ov] project for IPsec[^ip] encryption of tunnel
+traffic[^oit]. A daemon running in each chassis automatically manages and
+monitors IPsec[^ip] tunnel states.
+
+#### Neutron[^ne] + Cilium[^ci]
+
+Another potential architecture involves a Neutron[^ne] plugin hooking an
+eBPF[^eb] proxy on each interface and moving internal traffic via an encrypted
+Cilium[^ci] mesh. Cilium uses IPsec[^ip] or WireGuard[^wg] to transparently
+encrypt node-to-node traffic. There were some attempts to integrate Cilium[^ci]
+with OpenStack [^neci1], [^neci2], but we didn't find any concrete projects
+which would leverage the transparent encryption ability of Cilium[^ci] in an
+OpenStack environment. This path would presumably require significant
+development.
+
+#### Neutron[^ne] + Calico[^ca]
+
+The Calico[^ca] project in its community open source version provides
+node-to-node encryption using WireGuard[^wg]. 
Despite being primarily a
+Kubernetes networking project, it provides an OpenStack integration[^caos] via
+a Neutron[^ne] plugin and by deploying the necessary subset of tools like etcd,
+the Calico agent Felix, the routing daemon BIRD and a DHCP agent.
+
+### Proof of concept implementations
+
+#### Neutron Plugin
+
+Initially, the potential for developing a specialized Neutron plugin was
+investigated and a simple skeleton implementation for testing purposes was
+devised.
+
+In-house development was later abandoned in favor of a more sustainable
+solution using existing technologies, as discussed in
+[preliminary considerations](#preliminary-considerations).
+
+#### Manual setup
+
+We created a working proof of concept by manually setting up VXLAN tunnels
+between nodes. While this solution ensures no impact on OpenStack and is easy
+to set up, it has limitations, such as unencrypted data transmission if the
+connection breaks. To mitigate this, we proposed using a dedicated subnet
+present only in the IPsec[^ip] tunnels.
+
+We presented the idea to the kolla-ansible[^ka] project, but it was deemed out
+of scope. Instead, we were directed towards a native Open vSwitch solution
+supporting IPsec[^ip]. This requires creating a new OpenStack service
+(working name: openstack-ipsec) and a role to manage chassis keys and run the
+openstack-ipsec container on each node.
+
+#### Proof of concept (PoC) implementation
+
+In our second proof of concept, we decided to implement support for
+openstack-ipsec. The initial step involved creating a new container image
+within the kolla[^kl] project specifically for this purpose.
+
+##### Architecture
+
+When Neutron[^ne] uses OVN[^ov] as its controller, it instructs it to create the
+necessary virtual networking infrastructure (logical switches, routers, etc.),
+in particular the Geneve tunnels between compute nodes. These tunnels are
+used to carry traffic between instances on different compute nodes.
+
+In the PoC setup, the Libreswan[^ls] suite runs on each compute node and manages
+the IPsec[^ip] tunnels. It encrypts the traffic flowing over the Geneve tunnels,
+ensuring that data is secure as it traverses the physical network. In the setup
+phase it establishes IPsec tunnels between compute nodes by negotiating the
+necessary security parameters (encryption, authentication, etc.). Once the
+tunnels are established, Libreswan[^ls] monitors and manages them, ensuring
+that the encryption keys are periodically refreshed and that the tunnels remain
+up. It also dynamically adds and removes tunnels based on changes in the network
+topology.
+
+A packet originating from a VM on one compute node and destined for a VM on
+a different node is processed by OVS and encapsulated into a Geneve tunnel.
+Before the Geneve-encapsulated packet leaves the compute node, it passes
+through the Libreswan process, which applies IPsec encryption. The encrypted
+packet traverses the physical network to the destination compute node. On the
+destination node, Libreswan[^ls] decrypts the packet, and OVN[^ov] handles
+decapsulation and forwards it to the target VM.
+
+##### Challenges
+
+While implementing the openstack-ipsec image, we encountered a significant
+challenge: the ovs-ctl start-ovs-ipsec command could not run inside the
+container because it requires a running init.d or systemctl to start the IPsec
+daemon immediately after OVS[^sw] deploys the configuration. We attempted to
+use supervisor to manage the processes within the container. 
However, this solution forced a
+manual start of the IPsec daemon before ovs-ctl had the chance to create the
+appropriate configurations.
+
+Another challenge was the requirement for both the IPsec daemon and ovs-ipsec
+to run within a single container. This added complexity to the container
+configuration and management, making it harder to ensure both services operated
+correctly and efficiently.
+
+##### Additional infrastructure
+
+A new Ansible role for generating chassis keys and distributing them to the
+respective machines was created. This utility also handles the configuration on
+each machine. Managing and creating production certificates is up to the user,
+which is also true for the backend TLS certificates in kolla-ansible[^ka].
+While this management should be handled within the same process, it currently
+poses a risk of downtime when certificates expire, as it requires careful
+management and timely renewal of certificates.
+
+The new container image was designed to include all necessary
+components for openstack-ipsec. Using supervisor to manage the IPsec daemon
+within the container involved creating configuration files to ensure all
+services start correctly. However, integrating supervisor introduced additional
+complexity and potential points of failure.
+
+##### Possible improvements
+
+The PoC doesn't currently address an opt-out possibility for disabling the
+encryption for a specific group of nodes, where the operator deems it
+detrimental because the nodes are virtual or where security is already handled
+in some other layer of the stack. This could be implemented as a further
+customization available to the operator to encrypt only some subset of Geneve
+tunnels, either in a blacklist or whitelist manner.
+
+Further refinement is needed to ensure ovs-ctl and the IPsec daemon start and
+configure correctly within the container environment. Exploring alternative
+process management tools or improving the configuration of supervisor could
+help achieve a more robust solution.
+
+Implementing automated certificate management could mitigate the risks
+associated with manual certificate renewals. Tools like Certbot or integration
+with existing Public Key Infrastructure (PKI) solutions might be beneficial.
+
+Engaging with the upstream Open vSwitch community to address containerization
+challenges and improve support for running ovs-ctl within containers could lead
+to a more sustainable solution.
+
+## Decision
+
+The final proof of concept implementation demonstrated the feasibility of
+implementing transparent IPsec[^ip] encryption between nodes in an OVN[^ov]
+logical networking setup in OpenStack.
+To recapitulate our preliminary considerations:
+
+### Utilize existing solutions
+
+The implementation in kolla-ansible[^ka] is unintrusive, provided by a
+self-contained new kolla[^kl] container, which only adds an IPsec[^ip]
+tunneling support module to OVS[^sw] - already an integral part of
+OpenStack networking - and a mature open source toolkit, Libreswan[^ls]. Also,
+OVN[^ov] has native support in OpenStack and became the default controller for
+Neutron[^ne].
+
+### Address threat modeling issues
+
+As discussed in the [motivation](#motivation) and [threat
+modelling](#potential-threats-in-detail) sections, our concern lies with the
+potentially vulnerable physical infrastructure between nodes inside or between
+data centers. 
In this case ensuring encryption and integrity of packets before +leaving any node addresses these threats, while avoiding the complexity of +securing the communication on the VM level, where frequent additions, deletions +and migrations could render such system complicated and error prone. We also +don't needlessly encrypt VM communication inside one node. + +### Avoid redundant encryption + +As the encryption happens inside tunnels specific for inter-node workload +communication, isolated on own network and also inside Geneve tunnels, no cloud +service data, which could be possibly encrypted on higher-levels (TLS) is +possible here. As to the workload communication itself - detecting higher-layer +encryption in a way that would allow IPsec[^ip] to avoid redundant encryption +is complex and would require custom modifications or non-standard solutions. +It's usually safer and more straightforward to allow the redundancy, ensuring +security at multiple layers, rather than trying to eliminate it. + +### Performance impact and ease of use + +Setup is straightforward for the operator, there is just a flag to enable or +disable the IPsec[^ip] encryption inside Geneve tunnels and the need to set the +Neutron[^ne] agent to OVN[^ov]. No other configuration is necessary. The only +other administrative burden is the deployment of certificates to provided +configuration directory on the control node. + +Certificate management for this solution can and should be handled in the same +way as for the backend service certificates which are part of the ongoing +efforts to provide complete service communication encryption in kolla-ansible. +Currently the management of these certificates is partially left on external +processes, but if a toolset or a process would be devised inside the project, +this solution would fit in. + +### Upstream integration + +The potential for upstream adoption and long-term maintainability makes this a +promising direction for securing inter-node communication in OpenStack +environments. 
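+
+The following sketch illustrates the intended operator experience described
+under "Performance impact and ease of use". The `neutron_plugin_agent` setting
+already exists in kolla-ansible; the IPsec switch is shown under a working name
+from the PoC and is not an upstream kolla-ansible option:
+
+```yaml
+# kolla-ansible globals.yml (sketch) - the IPsec flag name is illustrative only
+neutron_plugin_agent: "ovn"
+# working-name flag from the PoC enabling IPsec encryption of Geneve tunnels
+ovn_ipsec_enabled: "yes"
+```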
+
+## References
+
+[^ne]: [Neutron](https://docs.openstack.org/neutron/latest/) - networking as a service (NaaS) in OpenStack
+[^eb]: [eBPF](https://en.wikipedia.org/wiki/EBPF)
+[^ia]: Team IaaS call [minutes](https://github.com/SovereignCloudStack/minutes/blob/main/iaas/20240214.md)
+[^sw]: [Open vSwitch](https://www.openvswitch.org/)
+[^ip]: [IPsec](https://en.wikipedia.org/wiki/IPsec)
+[^ipwh]: [Why is IPsec so complicated](https://destcert.com/resources/why-the-is-ipsec-so-complicated/)
+[^to]: [TripleO](https://docs.openstack.org/developer/tripleo-docs/) - OpenStack on OpenStack
+[^ci]: [Cilium](https://cilium.io/)
+[^ca]: [Calico](https://docs.tigera.io/calico/latest/about)
+[^caos]: [Calico for OpenStack](https://docs.tigera.io/calico/latest/getting-started/openstack/overview)
+[^ta]: [Tailscale](https://tailscale.com/solutions/devops)
+[^ov]: [Open Virtual Network](https://www.ovn.org/en/) (OVN)
+[^oit]: [OVN IPsec tutorial](https://docs.ovn.org/en/latest/tutorials/ovn-ipsec.html)
+[^kl]: [kolla](https://opendev.org/openstack/kolla) project
+[^ka]: [kolla-ansible](https://docs.openstack.org/kolla-ansible/latest/) project
+[^wg]: [WireGuard](https://www.wireguard.com/)
+[^wgwp]: WireGuard [white paper](https://www.wireguard.com/papers/wireguard.pdf)
+[^ie]: [Internet Engineering Task Force](https://www.ietf.org/) (IETF)
+[^rfc]: [RFC8926](https://datatracker.ietf.org/doc/html/rfc8926#name-inter-data-center-traffic)
+[^lkc]: [Linux Kernel Crypto API](https://www.kernel.org/doc/html/v4.10/crypto/index.html)
+[^ls]: [Libreswan](https://libreswan.org/) VPN software
+[^ms]: [MACsec standard](https://en.wikipedia.org/wiki/IEEE_802.1AE)
+[^neci1]: [Neutron + Cilium architecture example](https://gist.github.com/oblazek/466a9ae836f663f8349b71e76abaee7e)
+[^neci2]: [Neutron + Cilium Proposal](https://github.com/cilium/cilium/issues/13433)
diff --git a/Standards/scs-0123-v1-mandatory-and-supported-IaaS-services.md b/Standards/scs-0123-v1-mandatory-and-supported-IaaS-services.md
new file mode 100644
index 000000000..1d94990bc
--- /dev/null
+++ b/Standards/scs-0123-v1-mandatory-and-supported-IaaS-services.md
@@ -0,0 +1,82 @@
+---
+title: Mandatory and Supported IaaS Services
+type: Standard
+status: Draft
+track: IaaS
+---
+
+## Introduction
+
+To be SCS-compliant a Cloud Service Provider (CSP) has to fulfill all SCS standards.
+Some of those standards are broad and consider the APIs of all services on the IaaS layer, such as the [role standard](https://github.com/SovereignCloudStack/issues/issues/396) under consideration.
+There exist many services on that layer, and as a first step their number needs to be limited to give the standards and the Cloud Service Providers following them a clear scope.
+For this purpose, this standard establishes a list of mandatory services whose APIs have to be present in an SCS cloud, as well as a list of supported services, whose APIs are considered by some standards and may even be tested for their integration, but which are optional in the sense that their omission will not violate SCS conformance.
+
+## Motivation
+
+There are many OpenStack APIs and their corresponding services that can be deployed on the IaaS level.
+These services differ in the quality of their implementation and liveness, and some of them may be easily omitted when creating an IaaS deployment.
+To fulfill all SCS-provided standards, only a subset of these APIs is required.
+Some more, but not all, of the remaining OpenStack APIs are also supported by the SCS project and may be part of its reference implementation.
+This results in different levels of support for specific services.
+This document will give readers insight into how the SCS classifies the OpenStack APIs accordingly.
+If a cloud provides all mandatory and any number of supported OpenStack APIs, it can be tested for SCS-compliance.
+Any unsupported APIs will not be tested.
+
+## Mandatory IaaS APIs
+
+The following IaaS APIs MUST be present in SCS-compliant IaaS deployments and could be implemented with the corresponding OpenStack services:
+
+| Mandatory API | corresponding OpenStack Service | description |
+|-----|-----|-----|
+| **block-storage** | Cinder | Block Storage service |
+| **compute** | Nova | Compute service |
+| **identity** | Keystone | Identity service |
+| **image** | Glance | Image service |
+| **load-balancer** | Octavia | Load-balancer service |
+| **network** | Neutron | Networking service |
+| **s3** | S3 API object storage | Object Storage service |
+
+:::caution
+
+S3 API implementations may differ in certain offered features.
+CSPs must publicly describe which implementation they use in their deployment.
+Users should always research whether a needed feature is supported in the offered implementation.
+
+:::
+
+The endpoints of services MUST be findable through the `catalog list` of the identity API[^1].
+
+[^1]: [Integrate into the service catalog of Keystone](https://docs.openstack.org/keystone/latest/contributor/service-catalog.html)
+
+## Supported IaaS APIs
+
+The following IaaS APIs MAY be present in SCS-compliant IaaS deployments, e.g. implemented through the corresponding OpenStack services, and are considered in the SCS standards.
+
+| Supported API | corresponding OpenStack Service | description |
+|-----|-----|-----|
+| **bare-metal** | Ironic | Bare Metal provisioning service |
+| **billing** | CloudKitty | Rating/Billing service |
+| **dns** | Designate | DNS service |
+| **ha** | Masakari | Instances High Availability service |
+| **key-manager** | Barbican | Key Manager service |
+| **object-store** | Swift | Object Store with different possible backends |
+| **orchestration** | Heat | Orchestration service |
+| **shared-file-systems** | Manila | Shared File Systems service |
+| **telemetry** | Ceilometer | Telemetry service |
+| **time-series-database** | Gnocchi | Time Series Database service |
+
+## Unsupported IaaS APIs
+
+All other OpenStack services whose APIs are not mentioned in the mandatory or supported lists will not be tested for their compatibility and conformance in SCS clouds by the SCS community.
+Those services MAY be integrated into IaaS deployments by a Cloud Service Provider on their own responsibility, but the SCS will not assume they are present, and potential issues that occur during deployment or usage have to be handled by the CSP on their own accord.
+The SCS standard offers no guarantees for compatibility or reliability of services categorized as unsupported.
+
+## Related Documents
+
+[The OpenStack Services](https://www.openstack.org/software/)
+
+## Conformance Tests
+
+The presence of the mandatory OpenStack APIs will be tested in [this test-script](https://github.com/SovereignCloudStack/standards/blob/mandatory-and-supported-IaaS-services/Tests/iaas/mandatory-services/mandatory-iaas-services.py).
+The test will further check whether the object store endpoint is compatible with S3.
diff --git a/Standards/scs-0210-v2-k8s-version-policy.md b/Standards/scs-0210-v2-k8s-version-policy.md index f09773034..3b086ec3d 100644 --- a/Standards/scs-0210-v2-k8s-version-policy.md +++ b/Standards/scs-0210-v2-k8s-version-policy.md @@ -56,7 +56,7 @@ In order to keep up-to-date with the latest Kubernetes features, bug fixes and s the provided Kubernetes versions should be kept up-to-date with new upstream releases: - The latest minor version MUST be provided no later than 4 months after release. -- The latest patch version MUST be provided no later than 1 week after release. +- The latest patch version MUST be provided no later than 2 weeks after release. - This time period MUST be even shorter for patches that fix critical CVEs. In this context, a critical CVE is a CVE with a CVSS base score >= 8 according to the CVSS version used in the original CVE record (e.g., CVSSv3.1). diff --git a/Standards/scs-0211-w1-kaas-default-storage-class-implementation-testing.md b/Standards/scs-0211-w1-kaas-default-storage-class-implementation-testing.md index d8112f299..1eeb89e48 100644 --- a/Standards/scs-0211-w1-kaas-default-storage-class-implementation-testing.md +++ b/Standards/scs-0211-w1-kaas-default-storage-class-implementation-testing.md @@ -2,7 +2,7 @@ title: "SCS KaaS default storage class: Implementation and Testing Notes" type: Supplement track: KaaS -status: Proposal +status: Draft supplements: - scs-0211-v1-kaas-default-storage-class.md --- diff --git a/Standards/scs-0214-w1-k8s-node-distribution-implementation-testing.md b/Standards/scs-0214-w1-k8s-node-distribution-implementation-testing.md index 79282fbd7..4366365a0 100644 --- a/Standards/scs-0214-w1-k8s-node-distribution-implementation-testing.md +++ b/Standards/scs-0214-w1-k8s-node-distribution-implementation-testing.md @@ -2,7 +2,7 @@ title: "Kubernetes Node Distribution and Availability: Implementation and Testing Notes" type: Supplement track: KaaS -status: Proposal +status: Draft supplements: - scs-0214-v1-k8s-node-distribution.md - scs-0214-v2-k8s-node-distribution.md diff --git a/Standards/scs-0219-v1-kaas-networking.md b/Standards/scs-0219-v1-kaas-networking.md new file mode 100644 index 000000000..8f35f7925 --- /dev/null +++ b/Standards/scs-0219-v1-kaas-networking.md @@ -0,0 +1,99 @@ +--- +title: KaaS Networking Standard +type: Standard +status: Draft +track: KaaS +--- + +## Introduction + +Kubernetes defines a networking model that needs to be implemented by a separate CNI plugin. +Beyond basic connectivity within the cluster, however, there are many networking features that are specified but optional. +Some of these optional features provide vital functionality, such as the NetworkPolicy API and the Ingress API. + +This standard specifies a minimal set of networking features that users can expect in clusters created by an SCS-compliant KaaS provider. + +## Terminology + +The following terms are used throughout this document: + +| Term | Meaning | +|------|---------| +| KaaS, managed Kubernetes | Kubernetes as a Service, automated on-demand deployment of Kubernetes clusters. | +| CSP | Cloud Service Provider, the provider of the KaaS infrastructure. | +| CNI | Container Network Interface, a standardized networking interface for container runtimes. | +| CNI plugin, networking plugin | Kubernetes bindings for a CNI implementation, translates Kubernetes API concepts into more basic container networking concepts. | +| network policy | A set of rules to restrict network traffic in a Kubernetes cluster. 
|
+
+## Motivation
+
+KaaS providers will typically support additional networking functionality beyond basic Kubernetes networking.
+The specific range of features depends on the CNI plugin in use, but may also be extended by additional operators.
+Users may expect certain optional functionality, so we should define a baseline feature set that has to be available in an SCS-compliant KaaS cluster.
+
+## Design Considerations
+
+The Kubernetes API can be extended arbitrarily.
+Many CNI plugins will define custom resources to enable functionality that is not covered in the official [API specification](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/).
+Sometimes they will even reuse names from different API groups, such as `NetworkPolicy`, which exists in the basic `networking.k8s.io/v1` API, but also in `projectcalico.org/v3`.
+
+To avoid any ambiguity, we should therefore be explicit about the API groups and versions of resources.
+We should also avoid mandating third-party API extensions, to avoid dependencies on specific third-party software and keep the standard as generic as possible.
+
+### Options considered
+
+#### NetworkPolicy API
+
+Kubernetes network policies are used to restrict network traffic between pods in a cluster, but also between pods and external network resources.
+The policy rules can filter based on port and address ranges, but also on Kubernetes-specific target attributes such as namespaces and labels.
+They must be implemented by the CNI plugin, and though they are widely supported, they are still technically optional, and there are some lightweight networking plugins, such as Flannel, that do not enforce them.
+
+Nonetheless, network policies are widely used and most users will expect them in a managed Kubernetes cluster.
+The wide but varying support among CNI plugins makes them a good target for SCS standardization.
+
+#### Default Network Policies in Namespaces
+
+Basic network policies are namespaced resources, and can only filter traffic to and from pods in their own namespace.
+In a newly created namespace without policies, the default behavior will apply, which is to not restrict traffic at all.
+
+It can be desirable to automatically create default network policies in new namespaces, using a policy operator such as Kyverno.
+A CSP could provide such an operator and offer a number of default policies, like blocking connections to other namespaces by default, or blocking access to the OpenStack metadata service.
+
+Any user with permissions to manage their own network policies in a namespace will of course be able to remove or modify any default network policies in that namespace.
+CSP-provided network policies should thus only be viewed as a safety default, and should only be deployed if they are actually beneficial to users.
+
+#### AdminNetworkPolicy API
+
+An alternative to automatically created default network policies is provided by API extensions that allow cluster-wide networking rules.
+Some CNI plugins have implemented such extensions, e.g. Calico's `GlobalNetworkPolicy` and Cilium's `CiliumClusterwideNetworkPolicy`.
+
+The Kubernetes Network Special Interest Group is currently working on an [official API extension](https://network-policy-api.sigs.k8s.io/api-overview/) to cover this functionality.
+This API extension introduces the new `AdminNetworkPolicy` and `BaselineAdminNetworkPolicy` resources, which represent cluster-wide network policies with respectively higher or lower precedence than namespaced network policies.
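+
+As a rough illustration, a cluster-wide ingress restriction under the current alpha of this API extension might look like the following sketch; since the API has not been stabilized, the group version and field names may still change:
+
+```yaml
+# Sketch based on the v1alpha1 draft of the network-policy-api extension; subject to change.
+apiVersion: policy.networking.k8s.io/v1alpha1
+kind: AdminNetworkPolicy
+metadata:
+  name: isolate-sensitive-namespaces
+spec:
+  priority: 10
+  subject:
+    namespaces:
+      matchLabels:
+        isolation: restricted
+  ingress:
+    - name: deny-ingress-from-all-namespaces
+      action: Deny
+      from:
+        - namespaces: {}
+```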
+ +This API is also a good candidate for standardization because it consolidates a number of vendor-specific workarounds to limitations of the NetworkPolicy API. +It has not been stabilized yet, so currently we can at most recommend CNI plugins where there is ongoing work to support these features. + +#### Ingress API + +The Ingress API allows the external exposure of HTTP/HTTPS-based services running in the cluster. +Unlike the L3/L4-based LoadBalancer Service type, Ingress provides L7 load balancing, HTTP routing, and TLS termination for services. +This functionality can be provided within the cluster by a pod-based ingress controller such as `ingress-nginx`, that exposes Ingress resources as Services. + +However, there are also Ingress controllers that integrate with underlying infrastructure and may help to reduce overhead. +Examples for this are the Cilium CNI plugin, which comes with built-in Ingress support, and the Octavia Ingress controller, which may be a good choice if OpenStack Octavia is already used to provide L3/L4 load balancing. + +The CSPs that manage the underlying infrastructure can of course make the best choice for such an integrated Ingress controller, so they should be encouraged to do so. +Even with a CSP-provided default Ingress controller present, users will be able to use alternative Ingress controllers by creating a new `IngressClass`, which can then be referenced in Ingress resources. + +## Decision + +CSPs MUST provide a network plugin that fully supports `NetworkPolicy` resources in the API version `networking.k8s.io/v1`. +CSPs SHOULD provide a network plugin that supports or is working on support for the `AdminNetworkPolicy` and `BaselineAdminNetworkPolicy` resources of the `policy.networking.k8s.io` API group, in their latest version, up to `v1`. + +CSPs SHOULD offer the option for a managed, `networking.k8s.io/v1`-compliant Ingress controller and a default `IngressClass` resource for this controller. + +CSPs MAY add default networking restrictions, using either `networking.k8s.io/v1`-compliant `NetworkPolicy` resources with a policy operator, or alternatively any cluster-wide network policy extensions provided by the CNI plugin. + +## Conformance Tests + +Required support for network policies will be tested using the upstream e2e tests via Sonobuoy. diff --git a/Standards/scs-0219-w1-kaas-networking.md b/Standards/scs-0219-w1-kaas-networking.md new file mode 100644 index 000000000..3e34948d2 --- /dev/null +++ b/Standards/scs-0219-w1-kaas-networking.md @@ -0,0 +1,27 @@ +--- +title: "KaaS Networking Standard: Implementation Notes" +type: Supplement +track: KaaS +status: Draft +supplements: + - scs-0219-v1-kaas-networking.md +--- +## List of compliant CNI Plugins + +The Kubernetes Network Policy API working group maintains a [list of work-in-progress implementations](https://network-policy-api.sigs.k8s.io/implementations/) of the AdminNetworkPolicy and BaselineAdminNetworkPolicy resources. 
+Besides their own proof-of-concept implementation of [kube-network-policies](https://github.com/kubernetes-sigs/kube-network-policies), at the time of writing they list the following CNI plugins: + +- [OVN-Kubernetes](https://github.com/ovn-org/ovn-kubernetes/) +- [Antrea](https://github.com/antrea-io/antrea/) +- [KubeOVN](https://github.com/kubeovn/kube-ovn) +- [Calico](https://github.com/projectcalico/calico) +- [Cilium](https://github.com/cilium/cilium) + +All of these plugins also implement the basic NetworkPolicy API, and are therefore compliant both with the standard's requirements and recommendations. + +The CNI plugin [Flannel](https://github.com/flannel-io/flannel) does not support network policies by itself, but can be combined with Calico for policy enforcement. +This configuration is known as [Canal](https://docs.tigera.io/calico/latest/getting-started/kubernetes/flannel/install-for-flannel) and will likely profit from Calico's support for AdminNetworkPolicy. + +There are more CNI plugins that support the NetworkPolicy API, but are not known to work on support of the AdminNetworkPolicy extensions. +As such they are still compliant with the current version of the Standard. +However, these seem to be either vendor-specific, like the [Azure CNI](https://learn.microsoft.com/de-de/azure/aks/configure-azure-cni), or unmaintained, like [Weave](https://github.com/weaveworks/weave). diff --git a/Standards/scs-0302-v1-domain-manager-role.md b/Standards/scs-0302-v1-domain-manager-role.md index 29ffa5a7c..a418a23b7 100644 --- a/Standards/scs-0302-v1-domain-manager-role.md +++ b/Standards/scs-0302-v1-domain-manager-role.md @@ -1,17 +1,27 @@ --- title: Domain Manager configuration for Keystone type: Standard -status: Draft +status: Stable +stabilized_at: 2024-11-13 track: IAM --- ## Introduction SCS Clouds should provide a way to grant Domain Manager rights to SCS Customers which provides IAM self-service capabilities within an OpenStack domain. -This is not properly implemented in the default OpenStack configuration and requires specific adjustments to the Keystone identity management configuration. +Such capabilities should enable the SCS customer to manage identity resources within their domain without involving the provider of the cloud. To avoid conflict with the unscoped `admin` role in OpenStack we want to refer to this new persona as "Domain Manager", introducing the `manager` role in the API for domains. -### Glossary +:::info + +The Domain Manager functionality will be a native part of the official OpenStack beginning with release 2024.2 ("Dalmatian"). + +To implement the Domain Manager in SCS clouds using an OpenStack release older than 2024.2, please refer to the supplemental [implementation notes for this standard](https://github.com/SovereignCloudStack/standards/blob/main/Standards/scs-0302-w1-domain-manager-implementation-notes.md). +The implementation notes document describes an alternative implementation that can be used for OpenStack 2024.1 and older releases. + +::: + +## Terminology The following special terms are used throughout this standard document: @@ -31,16 +41,6 @@ The following special terms are used throughout this standard document: [^1]: [OpenStack Documentation: Role-Based Access Control Overview](https://static.opendev.org/docs/patrole/latest/rbac-overview.html) -### Impact - -Applying this standard modifies the API policy configuration of Keystone and introduces a new persona to Keystone to enable IAM self-service for customers within a domain. 
-Once assigned, this persona allows special Domain Manager users within a domain to manage users, project, groups and role assignments as part of the IAM self-service. - -However, the configuration change introduced by this standard does not automatically assign the Domain Manager persona to any users per default. -Assigning the new persona and granting customers the resulting self-service capabilities is a deliberate action to be taken by the CSP on a per-tenant (i.e. per domain) basis. - -Omitting the provisioning of any Domain Manager users (i.e. not assigning the new persona to any user) will result in an OpenStack cloud that behaves identically to a configuration without the standard applied, making the actual usage of the functionality a CSP's choice and entirely optional. - ## Motivation In the default configuration of Keystone, only users with the `admin` role may manage the IAM resources such as projects, groups and users and their relation through role assignments. @@ -94,180 +94,52 @@ This means that by creating a new role and extending Keystone's API policy confi [^4]: [OpenStack Documentation: Administering Applications that use oslo.policy](https://docs.openstack.org/oslo.policy/latest/admin/index.html) -## Open questions - -### Limitations - -The approach described in this standard imposes the following limitations: +## Decision -1. as a result of the "`identity:list_domains`" rule (see below), Domain Managers are able to see all domains[^5] via "`openstack domain list`" and can inspect the metadata of other domains with "`openstack domain show`" -2. as a result of the "`identity:list_roles`" rule (see below), Domain Managers are able to see all roles via "`openstack role list`" and can inspect the metadata of other roles with "`openstack role show`" +A role named "`manager`" MUST be present in the identity service. -**As a result of points 1 and 2, metadata of all domains and roles will be exposed to all Domain Managers!** +The identity service MUST implement the Domain Manager functionality for this role. +The implementation details depend on the OpenStack Keystone version used. +See the sections below for reference. -If a CSP deems either of these points critical, they may abstain from granting the `"manager"` role to any user in a domain scope, effectively disabling the Domain Manager functionality. See [Impact](#impact). +### For OpenStack Keystone 2024.2 or later -[^5]: see the [corresponding Launchpad bug at Keystone](https://bugs.launchpad.net/keystone/+bug/2041611) +For OpenStack Keystone 2024.2 or later the Domain Manager persona is already integrated natively. +To guarantee proper scope protection, the Identity API MUST be configured with "`enforce_scope`" and "`enforce_new_defaults`" enabled for the oslo.policy library. -## Decision +Example entries for the `keystone.conf` configuration file: -A role named "`manager`" is to be created via the Keystone API and the policy adjustments quoted below are to be applied. - -### Policy adjustments - -The following policy has to be applied to Keystone in a verbatim fashion. -The only parts of the policy definitions that may be changed are: - -1. The "`base_*`" definitions to align them to the correct OpenStack defaults matching the OpenStack release of the environment in case those differ from this template. -2. The "`is_domain_managed_role`" definition (see next section below). 
- -```yaml -# SCS Domain Manager policy configuration - -# Section A: OpenStack base definitions -# The entries beginning with "base_" should be exact copies of the -# default "identity:" definitions for the target OpenStack release. -# They will be extended upon for the manager role below this section. -"base_get_domain": "(role:reader and system_scope:all) or token.domain.id:%(target.domain.id)s or token.project.domain.id:%(target.domain.id)s" -"base_list_domains": "(role:reader and system_scope:all)" -"base_list_roles": "(role:reader and system_scope:all)" -"base_get_role": "(role:reader and system_scope:all)" -"base_list_users": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" -"base_get_user": "(role:reader and system_scope:all) or (role:reader and token.domain.id:%(target.user.domain_id)s) or user_id:%(target.user.id)s" -"base_create_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" -"base_update_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" -"base_delete_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" -"base_list_projects": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" -"base_get_project": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.project.domain_id)s) or project_id:%(target.project.id)s" -"base_create_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" -"base_update_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" -"base_delete_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" -"base_list_user_projects": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s) or user_id:%(target.user.id)s" -"base_check_grant": "(role:reader and system_scope:all) or ((role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" -"base_list_grants": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)" -"base_create_grant": "(role:admin and system_scope:all) or ((role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" -"base_revoke_grant": "(role:admin and system_scope:all) or 
((role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" -"base_list_role_assignments": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" -"base_list_groups": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" -"base_get_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" -"base_create_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" -"base_update_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" -"base_delete_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" -"base_list_groups_for_user": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s) or user_id:%(user_id)s" -"base_list_users_in_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" -"base_remove_user_from_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" -"base_check_user_in_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" -"base_add_user_to_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" - -# Section B: Domain Manager Extensions - -# classify domain managers with a special role -"is_domain_manager": "role:manager" - -# specify a rule that whitelists roles which domain admins are permitted -# to assign and revoke within their domain -"is_domain_managed_role": "'member':%(target.role.name)s or 'load-balancer_member':%(target.role.name)s" - -# allow domain admins to retrieve their own domain (does not need changes) -"identity:get_domain": "rule:base_get_domain or rule:admin_required" - -# list_domains is needed for GET /v3/domains?name=... requests -# this is mandatory for things like -# `create user --domain $DOMAIN_NAME $USER_NAME` to correctly discover -# domains by name -"identity:list_domains": "rule:is_domain_manager or rule:base_list_domains or rule:admin_required" - -# list_roles is needed for GET /v3/roles?name=... requests -# this is mandatory for things like `role add ... 
$ROLE_NAME`` to correctly -# discover roles by name -"identity:list_roles": "rule:is_domain_manager or rule:base_list_roles or rule:admin_required" - -# get_role is needed for GET /v3/roles/{role_id} requests -# this is mandatory for the OpenStack SDK to properly process role assignments -# which are issued by role id instead of name -"identity:get_role": "(rule:is_domain_manager and rule:is_domain_managed_role) or rule:base_get_role or rule:admin_required" - -# allow domain admins to manage users within their domain -"identity:list_users": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_users or rule:admin_required" -"identity:get_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_get_user or rule:admin_required" -"identity:create_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_create_user or rule:admin_required" -"identity:update_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_update_user or rule:admin_required" -"identity:delete_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_delete_user or rule:admin_required" - -# allow domain admins to manage projects within their domain -"identity:list_projects": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_projects or rule:admin_required" -"identity:get_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_get_project or rule:admin_required" -"identity:create_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_create_project or rule:admin_required" -"identity:update_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_update_project or rule:admin_required" -"identity:delete_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_delete_project or rule:admin_required" -"identity:list_user_projects": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_list_user_projects or rule:admin_required" - -# allow domain managers to manage role assignments within their domain -# (restricted to specific roles by the 'is_domain_managed_role' rule) -# -# project-level role assignment to user within domain -"is_domain_user_project_grant": "token.domain.id:%(target.user.domain_id)s and token.domain.id:%(target.project.domain_id)s" -# project-level role assignment to group within domain -"is_domain_group_project_grant": "token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.project.domain_id)s" -# domain-level role assignment to group -"is_domain_level_group_grant": "token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.domain.id)s" -# domain-level role assignment to user -"is_domain_level_user_grant": "token.domain.id:%(target.user.domain_id)s and token.domain.id:%(target.domain.id)s" -"domain_manager_grant": "rule:is_domain_manager and (rule:is_domain_user_project_grant or rule:is_domain_group_project_grant or rule:is_domain_level_group_grant or rule:is_domain_level_user_grant)" -"identity:check_grant": "rule:domain_manager_grant or rule:base_check_grant or rule:admin_required" -"identity:list_grants": "rule:domain_manager_grant or rule:base_list_grants or rule:admin_required" -"identity:create_grant": "(rule:domain_manager_grant and rule:is_domain_managed_role) or 
rule:base_create_grant or rule:admin_required" -"identity:revoke_grant": "(rule:domain_manager_grant and rule:is_domain_managed_role) or rule:base_revoke_grant or rule:admin_required" -"identity:list_role_assignments": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_role_assignments or rule:admin_required" - - -# allow domain managers to manage groups within their domain -"identity:list_groups": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or (role:reader and system_scope:all) or rule:base_list_groups or rule:admin_required" -"identity:get_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or (role:reader and system_scope:all) or rule:base_get_group or rule:admin_required" -"identity:create_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_create_group or rule:admin_required" -"identity:update_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_update_group or rule:admin_required" -"identity:delete_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_delete_group or rule:admin_required" -"identity:list_groups_for_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_list_groups_for_user or rule:admin_required" -"identity:list_users_in_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_list_users_in_group or rule:admin_required" -"identity:remove_user_from_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_remove_user_from_group or rule:admin_required" -"identity:check_user_in_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_check_user_in_group or rule:admin_required" -"identity:add_user_to_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_add_user_to_group or rule:admin_required" +```ini +[oslo_policy] +enforce_new_defaults = True +enforce_scope = True ``` -Note that the policy file begins with a list of "`base_*`" rule definitions ("Section A"). -These mirror the default policies of recent OpenStack releases. -They are used as a basis for the domain-manager-specific changes which are implemented in "Section B" where they are referenced to via "`or rule:base_*`" accordingly. -The section of "`base_*`" rules is meant for easy maintenance/update of default rules while keeping the domain-manager-specific rules separate. - -> **Note:** -> The "`or rule:admin_required`" appendix to the rule definitions in "Section B" is included for backwards compatibility with environments not yet fully configured for the new secure RBAC standard[^6]. - -[^6]: [OpenStack Technical Committee Governance Documents: Consistent and Secure Default RBAC](https://governance.openstack.org/tc/goals/selected/consistent-and-secure-rbac.html) +The "`is_domain_managed_role`" policy rule MAY be adjusted using a dedicated `policy.yaml` file for the Identity API in order to adjust the set of roles a Domain Manager is able to assign/revoke. +When doing so, the `admin` role MUST NOT be added to this set. 
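+
+For illustration, such a `policy.yaml` override could look like the following sketch, which additionally permits the `reader` role; the rule syntax is the same as in the policy template of the implementation notes and the chosen roles are only an example:
+
+```yaml
+# policy.yaml sketch: example set of roles a Domain Manager may assign/revoke
+"is_domain_managed_role": "'member':%(target.role.name)s or 'load-balancer_member':%(target.role.name)s or 'reader':%(target.role.name)s"
+```
+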
-#### Specifying manageable roles via "`is_domain_managed_role`" +#### Note about upgrading from SCS Domain Manager to native integration -The "`is_domain_managed_role`" rule of the above policy template may be adjusted according to the requirements of the CSP and infrastructure architecture to specify different or multiple roles as manageable by Domain Managers as long as the policy rule adheres to the following: +In case the Identity API was upgraded from an older version where the policy-based Domain Manager implementation of SCS described in the [implementation notes for this standard](https://github.com/SovereignCloudStack/standards/blob/main/Standards/scs-0302-w1-domain-manager-implementation-notes.md) was still in use, the policies described there MUST be removed. +The only exception to this is the "`is_domain_managed_role`" rule in case any adjustments have been made to that rule and the CSP wants to preserve them. -- the "`is_domain_managed_role`" rule MUST NOT contain the "`admin`" role, neither directly nor transitively -- the "`is_domain_managed_role`" rule MUST define all applicable roles directly, it MUST NOT contain a "`rule:`" reference within itself +### For OpenStack Keystone 2024.1 or below -##### Example: permitting multiple roles +For OpenStack Keystone 2024.1 or below, the Domain Manager functionality MUST be implemented using API policies. +For details, refer to the [implementation notes for this standard](https://github.com/SovereignCloudStack/standards/blob/main/Standards/scs-0302-w1-domain-manager-implementation-notes.md). -The following example permits the "`reader`" role to be assigned/revoked by a Domain Manager in addition to the default "`member`" and "`load-balancer_member`" roles. -Further roles can be appended using the logical `or` directive. +For the release 2024.1 and below, changing the "`enforce_scope`" and "`enforce_new_defaults`" options for the Identity API is not necessary for the Domain Manager implementation. -```yaml -"is_domain_managed_role": "'member':%(target.role.name)s or 'load-balancer_member':%(target.role.name)s or 'reader':%(target.role.name)s" -``` - -**Note regarding the `manager` role** +## Related Documents -When adjusting the "`is_domain_managed_role`" rule a CSP might opt to also include the "`manager`" role itself in the manageable roles, resulting in Domain Managers being able to propagate the Domain Manager capabilities to other users within their domain. -This increases the self-service capabilities of the customer but introduces risks of Domain Managers also being able to revoke this role from themselves or each other (within their domain) in an unintended fashion. +### Upstream contribution spec for the Domain Manager functionality -CSPs have to carefully evaluate whether Domain Manager designation authority should reside solely on their side or be part of the customer self-service scope and decide about adding "`'manager':%(target.role.name)s`" to the rule accordingly. +**Description:** Upstream Identity service specification to introduce the Domain Manager functionality natively in OpenStack Keystone. +After implementing the Domain Manager functionality as described in the [implementation notes for this standard](https://github.com/SovereignCloudStack/standards/blob/main/Standards/scs-0302-w1-domain-manager-implementation-notes.md), the SCS project contributed the functionality to the official OpenStack project. 
+This eventually resulted in the feature being integrated natively in OpenStack Keystone starting with the 2024.2 release. +The specification was the starting point of the contribution. -## Related Documents +**Link:** [OpenStack Identity Specs: Domain Manager Persona for domain-scoped self-service administration](https://specs.openstack.org/openstack/keystone-specs/specs/keystone/2024.1/domain-manager-persona.html) ### "admin"-ness not properly scoped diff --git a/Standards/scs-0302-w1-domain-manager-implementation-notes.md b/Standards/scs-0302-w1-domain-manager-implementation-notes.md new file mode 100644 index 000000000..6e2c60298 --- /dev/null +++ b/Standards/scs-0302-w1-domain-manager-implementation-notes.md @@ -0,0 +1,194 @@ +--- +title: Domain Manager implementation notes +type: Supplement +track: IAM +status: Draft +supplements: + - scs-0302-v1-domain-manager-role.md +--- + +## Implementation notes + +:::caution + +If a Keystone release of OpenStack 2024.2 or later is used, **the policy configuration described in this document MUST be removed again** in case it was applied in the past prior to the upgrade. + +::: + +:::info + +The implementation described in this document only applies to Keystone releases prior to the OpenStack release 2024.2 ("Dalmatian"). +This document describes a transitional solution to offer the Domain Manager functionality for SCS clouds based on an OpenStack release earlier than 2024.2. + +Beginning with the 2024.2 release of OpenStack, the Domain Manager persona is integrated natively into Keystone and the implementation described below is unnecessary and might conflict with the native implementation. + +::: + +### Policy adjustments + +The following policy can be applied to Keystone releases older than 2024.2 ("Dalmatian"). +It mimics the Domain Manager persona implemented by Keystone starting with version 2024.2 and makes the functionality available for earlier releases of Keystone. + +The only parts of the policy definitions below that may be changed are: + +1. The "`base_*`" definitions to align them to the correct OpenStack defaults matching the OpenStack release of the environment in case those differ from this template. +2. The "`is_domain_managed_role`" definition (see next section below). + +```yaml +# SCS Domain Manager policy configuration + +# Section A: OpenStack base definitions +# The entries beginning with "base_" should be exact copies of the +# default "identity:" definitions for the target OpenStack release. +# They will be extended upon for the manager role below this section. 
+"base_get_domain": "(role:reader and system_scope:all) or token.domain.id:%(target.domain.id)s or token.project.domain.id:%(target.domain.id)s" +"base_list_domains": "(role:reader and system_scope:all)" +"base_list_roles": "(role:reader and system_scope:all)" +"base_get_role": "(role:reader and system_scope:all)" +"base_list_users": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" +"base_get_user": "(role:reader and system_scope:all) or (role:reader and token.domain.id:%(target.user.domain_id)s) or user_id:%(target.user.id)s" +"base_create_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" +"base_update_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" +"base_delete_user": "(role:admin and system_scope:all) or (role:admin and token.domain.id:%(target.user.domain_id)s)" +"base_list_projects": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" +"base_get_project": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.project.domain_id)s) or project_id:%(target.project.id)s" +"base_create_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" +"base_update_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" +"base_delete_project": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.project.domain_id)s)" +"base_list_user_projects": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s) or user_id:%(target.user.id)s" +"base_check_grant": "(role:reader and system_scope:all) or ((role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" +"base_list_grants": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)" +"base_create_grant": "(role:admin and system_scope:all) or ((role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" +"base_revoke_grant": "(role:admin and system_scope:all) or ((role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin and domain_id:%(target.user.domain_id)s and domain_id:%(target.domain.id)s) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.project.domain_id)s) or (role:admin 
and domain_id:%(target.group.domain_id)s and domain_id:%(target.domain.id)s)) and (domain_id:%(target.role.domain_id)s or None:%(target.role.domain_id)s)" +"base_list_role_assignments": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.domain_id)s)" +"base_list_groups": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" +"base_get_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" +"base_create_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" +"base_update_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" +"base_delete_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s)" +"base_list_groups_for_user": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.user.domain_id)s) or user_id:%(user_id)s" +"base_list_users_in_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s)" +"base_remove_user_from_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" +"base_check_user_in_group": "(role:reader and system_scope:all) or (role:reader and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" +"base_add_user_to_group": "(role:admin and system_scope:all) or (role:admin and domain_id:%(target.group.domain_id)s and domain_id:%(target.user.domain_id)s)" + +# Section B: Domain Manager Extensions + +# classify domain managers with a special role +"is_domain_manager": "role:manager" + +# specify a rule that whitelists roles which domain admins are permitted +# to assign and revoke within their domain +"is_domain_managed_role": "'member':%(target.role.name)s or 'load-balancer_member':%(target.role.name)s" + +# allow domain admins to retrieve their own domain (does not need changes) +"identity:get_domain": "rule:base_get_domain or rule:admin_required" + +# list_domains is needed for GET /v3/domains?name=... requests +# this is mandatory for things like +# `create user --domain $DOMAIN_NAME $USER_NAME` to correctly discover +# domains by name +"identity:list_domains": "rule:is_domain_manager or rule:base_list_domains or rule:admin_required" + +# list_roles is needed for GET /v3/roles?name=... requests +# this is mandatory for things like `role add ... 
$ROLE_NAME`` to correctly +# discover roles by name +"identity:list_roles": "rule:is_domain_manager or rule:base_list_roles or rule:admin_required" + +# get_role is needed for GET /v3/roles/{role_id} requests +# this is mandatory for the OpenStack SDK to properly process role assignments +# which are issued by role id instead of name +"identity:get_role": "(rule:is_domain_manager and rule:is_domain_managed_role) or rule:base_get_role or rule:admin_required" + +# allow domain admins to manage users within their domain +"identity:list_users": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_users or rule:admin_required" +"identity:get_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_get_user or rule:admin_required" +"identity:create_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_create_user or rule:admin_required" +"identity:update_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_update_user or rule:admin_required" +"identity:delete_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_delete_user or rule:admin_required" + +# allow domain admins to manage projects within their domain +"identity:list_projects": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_projects or rule:admin_required" +"identity:get_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_get_project or rule:admin_required" +"identity:create_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_create_project or rule:admin_required" +"identity:update_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_update_project or rule:admin_required" +"identity:delete_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:base_delete_project or rule:admin_required" +"identity:list_user_projects": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_list_user_projects or rule:admin_required" + +# allow domain managers to manage role assignments within their domain +# (restricted to specific roles by the 'is_domain_managed_role' rule) +# +# project-level role assignment to user within domain +"is_domain_user_project_grant": "token.domain.id:%(target.user.domain_id)s and token.domain.id:%(target.project.domain_id)s" +# project-level role assignment to group within domain +"is_domain_group_project_grant": "token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.project.domain_id)s" +# domain-level role assignment to group +"is_domain_level_group_grant": "token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.domain.id)s" +# domain-level role assignment to user +"is_domain_level_user_grant": "token.domain.id:%(target.user.domain_id)s and token.domain.id:%(target.domain.id)s" +"domain_manager_grant": "rule:is_domain_manager and (rule:is_domain_user_project_grant or rule:is_domain_group_project_grant or rule:is_domain_level_group_grant or rule:is_domain_level_user_grant)" +"identity:check_grant": "rule:domain_manager_grant or rule:base_check_grant or rule:admin_required" +"identity:list_grants": "rule:domain_manager_grant or rule:base_list_grants or rule:admin_required" +"identity:create_grant": "(rule:domain_manager_grant and rule:is_domain_managed_role) or 
rule:base_create_grant or rule:admin_required" +"identity:revoke_grant": "(rule:domain_manager_grant and rule:is_domain_managed_role) or rule:base_revoke_grant or rule:admin_required" +"identity:list_role_assignments": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:base_list_role_assignments or rule:admin_required" + +# allow domain managers to manage groups within their domain +"identity:list_groups": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or (role:reader and system_scope:all) or rule:base_list_groups or rule:admin_required" +"identity:get_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or (role:reader and system_scope:all) or rule:base_get_group or rule:admin_required" +"identity:create_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_create_group or rule:admin_required" +"identity:update_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_update_group or rule:admin_required" +"identity:delete_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_delete_group or rule:admin_required" +"identity:list_groups_for_user": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:base_list_groups_for_user or rule:admin_required" +"identity:list_users_in_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s) or rule:base_list_users_in_group or rule:admin_required" +"identity:remove_user_from_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_remove_user_from_group or rule:admin_required" +"identity:check_user_in_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_check_user_in_group or rule:admin_required" +"identity:add_user_to_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:base_add_user_to_group or rule:admin_required" +``` + +Note that the policy file begins with a list of "`base_*`" rule definitions ("Section A"). +These mirror the default policies of recent OpenStack releases. +They are used as a basis for the domain-manager-specific changes which are implemented in "Section B" where they are referenced to via "`or rule:base_*`" accordingly. +The section of "`base_*`" rules is meant for easy maintenance/update of default rules while keeping the domain-manager-specific rules separate. + +> **Note:** +> The "`or rule:admin_required`" appendix to the rule definitions in "Section B" is included for backwards compatibility with environments not yet fully configured for the new secure RBAC standard[^1]. 
+
+[^1]: [OpenStack Technical Committee Governance Documents: Consistent and Secure Default RBAC](https://governance.openstack.org/tc/goals/selected/consistent-and-secure-rbac.html)
+
+#### Specifying manageable roles via "`is_domain_managed_role`"
+
+The "`is_domain_managed_role`" rule of the above policy template may be adjusted according to the requirements of the CSP and infrastructure architecture to specify different or multiple roles as manageable by Domain Managers, as long as the policy rule adheres to the following:
+
+- the "`is_domain_managed_role`" rule MUST NOT contain the "`admin`" role, either directly or transitively
+- the "`is_domain_managed_role`" rule MUST define all applicable roles directly; it MUST NOT contain a "`rule:`" reference within itself
+
+##### Example: permitting multiple roles
+
+The following example permits the "`reader`" role to be assigned/revoked by a Domain Manager in addition to the default "`member`" and "`load-balancer_member`" roles.
+Further roles can be appended using the logical `or` operator.
+
+```yaml
+"is_domain_managed_role": "'member':%(target.role.name)s or 'load-balancer_member':%(target.role.name)s or 'reader':%(target.role.name)s"
+```
+
+**Note regarding the `manager` role**
+
+When adjusting the "`is_domain_managed_role`" rule, a CSP might opt to also include the "`manager`" role itself in the manageable roles, allowing Domain Managers to propagate the Domain Manager capabilities to other users within their domain.
+This increases the customer's self-service capabilities but introduces the risk of Domain Managers revoking this role from themselves or each other (within their domain) in an unintended fashion.
+
+CSPs have to carefully evaluate whether the authority to designate Domain Managers should reside solely with them or be part of the customer self-service scope, and decide about adding "`'manager':%(target.role.name)s`" to the rule accordingly.
+
+### Impact
+
+Applying this implementation modifies Keystone's API policy configuration and introduces a new persona to enable IAM self-service for customers within a domain.
+Once assigned, this persona allows special Domain Manager users within a domain to manage users, projects, groups and role assignments as part of the IAM self-service.
+
+However, the configuration change introduced by this implementation does not automatically assign the Domain Manager persona to any users by default.
+Assigning the new persona and granting customers the resulting self-service capabilities is a deliberate action to be taken by the CSP on a per-tenant (i.e. per-domain) basis.
+
+Omitting the provisioning of any Domain Manager users (i.e. not assigning the new persona to any user) results in an OpenStack cloud that behaves identically to one without the implementation applied, making the actual usage of the functionality a CSP's choice and entirely optional.
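+For illustration, assigning the Domain Manager persona to a customer user amounts to a single domain-scoped role assignment performed by the CSP.
+The user and domain names in the following sketch are placeholders; it assumes the persona is conveyed by the "`manager`" role as discussed above:
+
+```bash
+# grant the Domain Manager persona to user "alice" for the domain "customer-a"
+# (to be executed by the CSP with sufficient privileges)
+openstack role add --user alice --user-domain customer-a --domain customer-a manager
+```
+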
+ +#### Security implications + +As a result of the "`identity:list_roles`" rule (see above), Domain Managers are able to see all roles via "`openstack role list`" and can inspect the metadata of any role with "`openstack role show`" diff --git a/Standards/scs-XXXX-vN-decision-record-template.md b/Standards/scs-XXXX-vN-decision-record-template.md index 4b73c1ca0..774bd10b6 100644 --- a/Standards/scs-XXXX-vN-decision-record-template.md +++ b/Standards/scs-XXXX-vN-decision-record-template.md @@ -1,7 +1,7 @@ --- title: _Descriptive title_ type: Decision Record -status: Proposal +status: Draft track: Global # | IaaS | Ops | KaaS | IAM --- diff --git a/Standards/scs-XXXX-vN-standard-template.md b/Standards/scs-XXXX-vN-standard-template.md index 52a4e7c6e..1b8afaf22 100644 --- a/Standards/scs-XXXX-vN-standard-template.md +++ b/Standards/scs-XXXX-vN-standard-template.md @@ -1,7 +1,7 @@ --- title: _Descriptive title_ type: Standard # | Procedural -status: Proposal +status: Draft track: Global # | IaaS | Ops | KaaS | IAM --- diff --git a/Tests/config.toml b/Tests/config.toml index 6833fc5ad..9791996d2 100644 --- a/Tests/config.toml +++ b/Tests/config.toml @@ -26,6 +26,7 @@ subjects = [ "poc-kdo", "poc-wgcloud", "regio-a", + "scaleup-occ2", "syseleven-dus2", "syseleven-ham1", "wavestack", diff --git a/Tests/iaas/entropy/entropy-check.py b/Tests/iaas/entropy/entropy-check.py index b24cfcf34..da4e8a7ef 100755 --- a/Tests/iaas/entropy/entropy-check.py +++ b/Tests/iaas/entropy/entropy-check.py @@ -437,7 +437,7 @@ def main(argv): all_flavors = conn.list_flavors(get_extra=True) if '*' not in image_visibility: - logger.debug(f"Images: filter for visibility {', '.join(image_visibility)}") + logger.debug(f"Images: filter for visibility {', '.join(sorted(image_visibility))}") all_images = [img for img in all_images if img.visibility in image_visibility] all_image_names = [f"{img.name} ({img.visibility})" for img in all_images] logger.debug(f"Images: {', '.join(all_image_names) or '(NONE)'}") diff --git a/Tests/iaas/flavor-naming/cli.py b/Tests/iaas/flavor-naming/cli.py index 86969cbbb..796b6a733 100755 --- a/Tests/iaas/flavor-naming/cli.py +++ b/Tests/iaas/flavor-naming/cli.py @@ -72,7 +72,7 @@ def parse(cfg, version, name, output='none'): if flavorname is None: print(f"NOT an SCS flavor: {namestr}") elif output == 'prose': - printv(name, end=': ') + printv(namestr, end=': ') print(f"{prettyname(flavorname)}") elif output == 'yaml': print(yaml.dump(flavorname_to_dict(flavorname), explicit_start=True)) diff --git a/Tests/iaas/flavor-naming/flavor-name-check.py b/Tests/iaas/flavor-naming/flavor-name-check.py index 536372757..e5d395e54 100755 --- a/Tests/iaas/flavor-naming/flavor-name-check.py +++ b/Tests/iaas/flavor-naming/flavor-name-check.py @@ -86,6 +86,9 @@ def main(argv): nm2 = _fnmck.outname(ret2) if nm1 != nm2: print(f"WARNING: {nm1} != {nm2}") + snm = _fnmck.outname(ret.shorten()) + if snm != nm1: + print(f"Shortened name: {snm}") argv = argv[1:] scs = 1 diff --git a/Tests/iaas/flavor-naming/flavor_names.py b/Tests/iaas/flavor-naming/flavor_names.py index 08b6d11d1..10ca54da6 100644 --- a/Tests/iaas/flavor-naming/flavor_names.py +++ b/Tests/iaas/flavor-naming/flavor_names.py @@ -162,6 +162,9 @@ class Main: raminsecure = BoolAttr("?no ECC", letter="u") ramoversubscribed = BoolAttr("?RAM Over", letter="o") + def shorten(self): + return self + class Disk: """Class representing the disk part""" @@ -171,6 +174,9 @@ class Disk: disksize = OptIntAttr("#.GB Disk") disktype = TblAttr("Disk type", {'': 
'(unspecified)', "n": "Networked", "h": "Local HDD", "s": "SSD", "p": "HiPerf NVMe"}) + def shorten(self): + return self + class Hype: """Class repesenting Hypervisor""" @@ -178,6 +184,9 @@ class Hype: component_name = "hype" hype = TblAttr(".Hypervisor", {"kvm": "KVM", "xen": "Xen", "hyv": "Hyper-V", "vmw": "VMware", "bms": "Bare Metal System"}) + def shorten(self): + return None + class HWVirt: """Class repesenting support for hardware virtualization""" @@ -185,6 +194,9 @@ class HWVirt: component_name = "hwvirt" hwvirt = BoolAttr("?HardwareVirt", letter="hwv") + def shorten(self): + return None + class CPUBrand: """Class repesenting CPU brand""" @@ -192,9 +204,11 @@ class CPUBrand: component_name = "cpubrand" cpuvendor = TblAttr("CPU Vendor", {"i": "Intel", "z": "AMD", "a": "ARM", "r": "RISC-V"}) cpugen = DepTblAttr("#.CPU Gen", cpuvendor, { - "i": {None: '(unspecified)', 0: "Unspec/Pre-Skylake", 1: "Skylake", 2: "Cascade Lake", 3: "Ice Lake", 4: "Sapphire Rapids"}, - "z": {None: '(unspecified)', 0: "Unspec/Pre-Zen", 1: "Zen 1", 2: "Zen 2", 3: "Zen 3", 4: "Zen 4"}, - "a": {None: '(unspecified)', 0: "Unspec/Pre-A76", 1: "A76/NeoN1", 2: "A78/X1/NeoV1", 3: "A710/NeoN2"}, + "i": {None: '(unspecified)', 0: "Unspec/Pre-Skylake", 1: "Skylake", 2: "Cascade Lake", 3: "Ice Lake", 4: "Sapphire Rapids", + 5: 'Sierra Forest (E)', 6: 'Granite Rapids (P)'}, + "z": {None: '(unspecified)', 0: "Unspec/Pre-Zen", 1: "Zen 1", 2: "Zen 2", 3: "Zen 3", 4: "Zen 4/4c", 5: "Zen 5/5c"}, + "a": {None: '(unspecified)', 0: "Unspec/Pre-A76", 1: "A76/NeoN1", 2: "A78/X1/NeoV1", 3: "A71x/NeoN2/V2", + 4: "AmpereOne", 5: "A72x/NeoN3/V3"}, "r": {None: '(unspecified)', 0: "Unspec"}, }) perf = TblAttr("Performance", {"": "Std Perf", "h": "High Perf", "hh": "Very High Perf", "hhh": "Very Very High Perf"}) @@ -204,21 +218,44 @@ def __init__(self, cpuvendor="i", cpugen=0, perf=""): self.cpugen = cpugen self.perf = perf + def shorten(self): + # For non-x86-64, don't strip out CPU brand for short name, as it contains the architecture + if self.cpuvendor in ('i', 'z'): + return None + return CPUBrand(self.cpuvendor) + class GPU: """Class repesenting GPU support""" type = "GPU" component_name = "gpu" gputype = TblAttr("Type", {"g": "vGPU", "G": "Pass-Through GPU"}) - brand = TblAttr("Brand", {"N": "nVidia", "A": "AMD", "I": "Intel"}) + brand = TblAttr("Brand", {"N": "Nvidia", "A": "AMD", "I": "Intel"}) gen = DepTblAttr("Gen", brand, { "N": {'': '(unspecified)', "f": "Fermi", "k": "Kepler", "m": "Maxwell", "p": "Pascal", - "v": "Volta", "t": "Turing", "a": "Ampere", "l": "AdaLovelace"}, - "A": {'': '(unspecified)', "0.4": "GCN4.0/Polaris", "0.5": "GCN5.0/Vega", "1": "RDNA1/Navi1x", "2": "RDNA2/Navi2x", "3": "RDNA3/Navi3x"}, - "I": {'': '(unspecified)', "0.9": "Gen9/Skylake", "0.95": "Gen9.5/KabyLake", "1": "Xe1/Gen12.1", "2": "Xe2"}, + "v": "Volta", "t": "Turing", "a": "Ampere", "l": "AdaLovelace", "g": "GraceHopper"}, + "A": {'': '(unspecified)', "0.4": "GCN4.0/Polaris", "0.5": "GCN5.0/Vega", "1": "RDNA1/Navi1x", "2": "C/RDNA2/Navi2x", + "3": "C/RDNA3/Navi3x", "3.5": "C/RDNA3.5", "4": "C/RDNA4"}, + "I": {'': '(unspecified)', "0.9": "Gen9/Skylake", "0.95": "Gen9.5/KabyLake", "1": "Xe1/Gen12.1/DG1", "2": "Xe2/Gen12.2", + "3": "Arc/Gen12.7/DG2"}, }) - cu = OptIntAttr("#.CU/EU/SM") - perf = TblAttr("Performance", {"": "Std Perf", "h": "High Perf", "hh": "Very High Perf", "hhh": "Very Very High Perf"}) + cu = OptIntAttr("#.N:SMs/A:CUs/I:EUs") + perf = TblAttr("Frequency", {"": "Std Freq", "h": "High Freq", "hh": "Very High Freq"}) + vram 
= OptIntAttr("#.V:GiB VRAM") + vramperf = TblAttr("Bandwidth", {"": "Std BW {<~1GiB/s)", "h": "High BW", "hh": "Very High BW"}) + + def __init__(self, gputype="g", brand="N", gen='', cu=None, perf='', vram=None, vramperf=''): + self.gputype = gputype + self.brand = brand + self.gen = gen + self.cu = cu + self.perf = perf + self.vram = vram + self.vramperf = vramperf + + def shorten(self): + # remove h modifiers + return GPU(gputype=self.gputype, brand=self.brand, gen=self.gen, cu=self.cu, vram=self.vram) class IB: @@ -227,6 +264,9 @@ class IB: component_name = "ib" ib = BoolAttr("?IB") + def shorten(self): + return self + class Flavorname: """A flavor name; merely a bunch of components""" @@ -244,14 +284,15 @@ def __init__( def shorten(self): """return canonically shortened name as recommended in the standard""" - if self.hype is None and self.hwvirt is None and self.cpubrand is None: - return self - # For non-x86-64, don't strip out CPU brand for short name, as it contains the architecture - if self.cpubrand and self.cpubrand.cpuvendor not in ('i', 'z'): - return Flavorname(cpuram=self.cpuram, disk=self.disk, - cpubrand=CPUBrand(self.cpubrand.cpuvendor), - gpu=self.gpu, ib=self.ib) - return Flavorname(cpuram=self.cpuram, disk=self.disk, gpu=self.gpu, ib=self.ib) + return Flavorname( + cpuram=self.cpuram and self.cpuram.shorten(), + disk=self.disk and self.disk.shorten(), + hype=self.hype and self.hype.shorten(), + hwvirt=self.hwvirt and self.hwvirt.shorten(), + cpubrand=self.cpubrand and self.cpubrand.shorten(), + gpu=self.gpu and self.gpu.shorten(), + ib=self.ib and self.ib.shorten(), + ) class Outputter: @@ -274,7 +315,7 @@ class Outputter: hype = "_%s" hwvirt = "_%?" cpubrand = "_%s%0%s" - gpu = "_%s%s%s%-%s" + gpu = "_%s%s%s%-%s%-%s" ib = "_%?" 
def output_component(self, pattern, component, parts): @@ -337,7 +378,7 @@ class SyntaxV1: hwvirt = re.compile(r"\-(hwv)") # cpubrand needs final lookahead assertion to exclude confusion with _ib extension cpubrand = re.compile(r"\-([izar])([0-9]*)(h*)(?=$|\-)") - gpu = re.compile(r"\-([gG])([NAI])([^:h]*)(?::([0-9]+)|)(h*)") + gpu = re.compile(r"\-([gG])([NAI])([^:h]*)(?::([0-9]+)|)(h*)(?::([0-9]+)|)(h*)") ib = re.compile(r"\-(ib)") @staticmethod @@ -362,7 +403,7 @@ class SyntaxV2: hwvirt = re.compile(r"_(hwv)") # cpubrand needs final lookahead assertion to exclude confusion with _ib extension cpubrand = re.compile(r"_([izar])([0-9]*)(h*)(?=$|_)") - gpu = re.compile(r"_([gG])([NAI])([^\-h]*)(?:\-([0-9]+)|)(h*)") + gpu = re.compile(r"_([gG])([NAI])([^\-h]*)(?:\-([0-9]+)|)(h*)(?:\-([0-9]+)|)(h*)") ib = re.compile(r"_(ib)") @staticmethod @@ -693,10 +734,14 @@ def prettyname(flavorname, prefix=""): if flavorname.gpu: stg += "and " + _tbl_out(flavorname.gpu, "gputype") stg += _tbl_out(flavorname.gpu, "brand") - stg += _tbl_out(flavorname.gpu, "perf", True) stg += _tbl_out(flavorname.gpu, "gen", True) if flavorname.gpu.cu is not None: - stg += f"(w/ {flavorname.gpu.cu} CU/EU/SM) " + stg += f"(w/ {flavorname.gpu.cu} {_tbl_out(flavorname.gpu, 'perf', True)}SMs/CUs/EUs" + # Can not specify VRAM without CUs + if flavorname.gpu.vram: + stg += f" and {flavorname.gpu.vram} GiB {_tbl_out(flavorname.gpu, 'vramperf', True)}VRAM) " + else: + stg += ") " # IB if flavorname.ib: stg += "and Infiniband " diff --git a/Tests/iaas/image-metadata/image-md-check.py b/Tests/iaas/image-metadata/image-md-check.py index ec8b9fa35..77830d3a2 100755 --- a/Tests/iaas/image-metadata/image-md-check.py +++ b/Tests/iaas/image-metadata/image-md-check.py @@ -11,13 +11,18 @@ SPDX-License-Identifier: CC-BY-SA-4.0 """ +import calendar +from collections import Counter +import getopt +import logging import os import sys import time -import calendar -import getopt + import openstack -from collections import Counter + + +logger = logging.getLogger(__name__) def usage(ret): @@ -31,8 +36,10 @@ def usage(ret): print(" -v/--verbose : Be more verbose") print(" -s/--skip-completeness: Don't check whether we have all mandatory images") print(" -h/--help : Print this usage information") - print("If you pass images, only these will be validated, otherwise all (public unless") - print(" -p is specified) images from the catalog will be processed.") + print(" [-V/--image-visibility VIS_LIST] : filters images by visibility") + print(" (default: 'public,community'; use '*' to disable)") + print("If you pass images, only these will be validated, otherwise all images") + print("(filtered according to -p, -V) from the catalog will be processed.") sys.exit(ret) @@ -335,15 +342,19 @@ def miss_replacement_images(by_name, outd_list): def main(argv): "Main entry point" + # configure logging, disable verbose library logging + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) + openstack.enable_logging(debug=False) # Option parsing global verbose + image_visibility = set() private = False skip = False cloud = os.environ.get("OS_CLOUD") err = 0 try: - opts, args = getopt.gnu_getopt(argv[1:], "phvc:s", - ("private", "help", "os-cloud=", "verbose", "skip-completeness")) + opts, args = getopt.gnu_getopt(argv[1:], "phvc:sV:", + ("private", "help", "os-cloud=", "verbose", "skip-completeness", "image-visibility=")) except getopt.GetoptError: # as exc: print("CRITICAL: Command-line syntax error", file=sys.stderr) usage(1) @@ -351,27 
+362,39 @@ def main(argv): if opt[0] == "-h" or opt[0] == "--help": usage(0) elif opt[0] == "-p" or opt[0] == "--private": - private = True + private = True # only keep this for backwards compatibility (we have -V now) elif opt[0] == "-v" or opt[0] == "--verbose": verbose = True + logging.getLogger().setLevel(logging.DEBUG) elif opt[0] == "-s" or opt[0] == "--skip-completeness": skip = True elif opt[0] == "-c" or opt[0] == "--os-cloud": cloud = opt[1] + if opt[0] == "-V" or opt[0] == "--image-visibility": + image_visibility.update([v.strip() for v in opt[1].split(',')]) images = args if not cloud: print("CRITICAL: Need to specify --os-cloud or set OS_CLOUD environment.", file=sys.stderr) usage(1) + if not image_visibility: + image_visibility.update(("public", "community")) + if private: + image_visibility.add("private") try: conn = openstack.connect(cloud=cloud, timeout=24) all_images = list(conn.image.images()) + if '*' not in image_visibility: + logger.debug(f"Images: filter for visibility {', '.join(sorted(image_visibility))}") + all_images = [img for img in all_images if img.visibility in image_visibility] + all_image_names = [f"{img.name} ({img.visibility})" for img in all_images] + logger.debug(f"Images: {', '.join(all_image_names) or '(NONE)'}") by_name = {img.name: img for img in all_images} if len(by_name) != len(all_images): counter = Counter([img.name for img in all_images]) duplicates = [name for name, count in counter.items() if count > 1] print(f'WARNING: duplicate names detected: {", ".join(duplicates)}', file=sys.stderr) if not images: - images = [img.name for img in all_images if private or img.visibility == 'public'] + images = [img.name for img in all_images] # Analyse image metadata outdated_images = [] for imgnm in images: diff --git a/Tests/iaas/key-manager/check-for-key-manager.py b/Tests/iaas/key-manager/check-for-key-manager.py old mode 100644 new mode 100755 index 6b5a5b70a..dae49acdd --- a/Tests/iaas/key-manager/check-for-key-manager.py +++ b/Tests/iaas/key-manager/check-for-key-manager.py @@ -1,135 +1,84 @@ -"""Mandatory APIs checker +#!/usr/bin/env python3 +"""Key Manager service checker for scs-0116-v1-key-manager-standard.md + This script retrieves the endpoint catalog from Keystone using the OpenStack -SDK and checks whether a key manager APi endpoint is present. +SDK and checks whether a key manager API endpoint is present. +It then checks whether a user with the maximum of a member role can create secrets. +This will only work after policy adjustments or with the new secure RBAC roles and policies. The script relies on an OpenStack SDK compatible clouds.yaml file for authentication with Keystone. """ import argparse -import json import logging import os +import sys import openstack - logger = logging.getLogger(__name__) -def connect(cloud_name: str) -> openstack.connection.Connection: - """Create a connection to an OpenStack cloud - :param string cloud_name: - The name of the configuration to load from clouds.yaml. - :returns: openstack.connnection.Connection - """ - return openstack.connect( - cloud=cloud_name, - ) +def initialize_logging(): + logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) -def check_for_member_role(conn: openstack.connection.Connection - ) -> None: - """Checks whether the current user has at maximum privileges - of the member role. - :param connection: - The current connection to an OpenStack cloud. 
- :returns: boolean, when role with most priviledges is member - """ +def check_for_member_role(conn: openstack.connection.Connection) -> None: + """Checks whether the current user has at maximum privileges of the member role. - auth_data = conn.auth - auth_dict = { - "identity": { - "methods": ["password"], - "password": { - "user": { - "name": auth_data['username'], - "domain": {"name": auth_data['project_domain_name']}, - "password": auth_data['password'] - } - }, - }, - "scope": { - "project": { - "domain": {"name": auth_data['project_domain_name']}, - "name": auth_data['project_name'] - } - } - } - - has_member_role = False - request = conn.session.request(auth_data['auth_url'] + '/v3/auth/tokens', - 'POST', - json={'auth': auth_dict}) - for role in json.loads(request.content)["token"]["roles"]: - role_name = role["name"] - if role_name == "admin" or role_name == "manager": - return False - elif role_name == "member": - print("User has member role.") - has_member_role = True - elif role_name == "reader": - print("User has reader role.") - else: - print("User has custom role.") - return False - return has_member_role - - -def check_presence_of_key_manager(cloud_name: str): + :param conn: connection to an OpenStack cloud. + :returns: boolean, when role with most privileges is member + """ + role_names = set(conn.session.auth.get_access(conn.session).role_names) + if role_names & {"admin", "manager"}: + return False + if "reader" in role_names: + logger.info("User has reader role.") + custom_roles = sorted(role_names - {"reader", "member"}) + if custom_roles: + logger.info(f"User has custom roles {', '.join(custom_roles)}.") + return "member" in role_names + + +def check_presence_of_key_manager(conn: openstack.connection.Connection) -> None: try: - connection = connect(cloud_name) - services = connection.service_catalog - except Exception as e: - print(str(e)) - raise Exception( - f"Connection to cloud '{cloud_name}' was not successfully. " - f"The Catalog endpoint could not be accessed. " - f"Please check your cloud connection and authorization." - ) + services = conn.service_catalog + except Exception: + logger.critical("Could not access Catalog endpoint.") + raise for svc in services: - svc_type = svc['type'] + svc_type = svc["type"] if svc_type == "key-manager": # key-manager is present # now we want to check whether a user with member role # can create and access secrets - check_key_manager_permissions(connection) - return 0 + logger.info("Key Manager is present") + return True + - # we did not find the key-manager service - logger.warning("There is no key-manager endpoint in the cloud.") - # we do not fail, until a key-manager MUST be present - return 0 +def _find_secret(conn: openstack.connection.Connection, secret_name_or_id: str): + """Replacement method for finding secrets. + + Mimicks the behavior of Connection.key_manager.find_secret() + but fixes an issue with the internal implementation raising an + exception due to an unexpected microversion parameter. + """ + secrets = conn.key_manager.secrets() + for s in secrets: + if s.name == secret_name_or_id or s.id == secret_name_or_id: + return s -def check_key_manager_permissions(conn: openstack.connection.Connection - ) -> None: +def check_key_manager_permissions(conn: openstack.connection.Connection) -> None: """ After checking that the current user only has the member and maybe the reader role, this method verifies that the user with a member role has sufficient access to the Key Manager API functionality. 
""" secret_name = "scs-member-role-test-secret" - if not check_for_member_role(conn): - logger.warning("Cannot test key-manager permissions. " - "User has wrong roles") - return None - - def _find_secret(secret_name_or_id: str): - """Replacement method for finding secrets. - - Mimicks the behavior of Connection.key_manager.find_secret() - but fixes an issue with the internal implementation raising an - exception due to an unexpected microversion parameter. - """ - secrets = conn.key_manager.secrets() - for s in secrets: - if s.name == secret_name_or_id or s.id == secret_name_or_id: - return s - return None - try: - existing_secret = _find_secret(secret_name) + existing_secret = _find_secret(conn, secret_name) if existing_secret: conn.key_manager.delete_secret(existing_secret) @@ -137,54 +86,71 @@ def _find_secret(secret_name_or_id: str): name=secret_name, payload_content_type="text/plain", secret_type="opaque", - payload="foo" - ) - - new_secret = _find_secret(secret_name) - assert new_secret, ( - f"Secret created with name '{secret_name}' was not discoverable by " - f"the user" - ) - conn.key_manager.delete_secret(new_secret) - except openstack.exceptions.ForbiddenException as e: - print( - "Users of the 'member' role can use Key Manager API: FAIL" + payload="foo", ) - print( - f"ERROR: {str(e)}" + try: + new_secret = _find_secret(conn, secret_name) + if not new_secret: + raise ValueError(f"Secret '{secret_name}' was not discoverable by the user") + finally: + conn.key_manager.delete_secret(new_secret) + except openstack.exceptions.ForbiddenException: + logger.debug('exception details', exc_info=True) + logger.error( + "Users with the 'member' role can use Key Manager API: FAIL" ) - exit(1) - print( - "Users of the 'member' role can use Key Manager API: PASS" + return 1 + logger.info( + "Users with the 'member' role can use Key Manager API: PASS" ) def main(): - parser = argparse.ArgumentParser( - description="SCS Mandatory IaaS Service Checker") + initialize_logging() + parser = argparse.ArgumentParser(description="SCS Mandatory IaaS Service Checker") parser.add_argument( - "--os-cloud", type=str, + "--os-cloud", + type=str, help="Name of the cloud from clouds.yaml, alternative " - "to the OS_CLOUD environment variable" + "to the OS_CLOUD environment variable", ) parser.add_argument( - "--debug", action="store_true", - help="Enable OpenStack SDK debug logging" + "--debug", action="store_true", help="Enable OpenStack SDK debug logging" ) args = parser.parse_args() - openstack.enable_logging(debug=args.debug) + # @mbuechse: I think this is so much as to be unusable! + # (If necessary, a developer can always uncomment) + # openstack.enable_logging(debug=args.debug) + if args.debug: + logger.setLevel(logging.DEBUG) # parse cloud name for lookup in clouds.yaml - cloud = os.environ.get("OS_CLOUD", None) - if args.os_cloud: - cloud = args.os_cloud - assert cloud, ( - "You need to have the OS_CLOUD environment variable set to your cloud " - "name or pass it via --os-cloud" - ) - - return check_presence_of_key_manager(cloud) + cloud = args.os_cloud or os.environ.get("OS_CLOUD", None) + if not cloud: + logger.critical( + "You need to have the OS_CLOUD environment variable set to your cloud " + "name or pass it via --os-cloud" + ) + return 2 + + with openstack.connect(cloud=cloud) as conn: + if not check_for_member_role(conn): + logger.critical("Cannot test key-manager permissions. 
User has wrong roles") + return 2 + if check_presence_of_key_manager(conn): + return check_key_manager_permissions(conn) + else: + # not an error, because key manager is merely recommended + logger.warning("There is no key-manager endpoint in the cloud.") if __name__ == "__main__": - main() + try: + sys.exit(main() or 0) + except SystemExit as e: + if e.code < 2: + print("key-manager-check: " + ('PASS', 'FAIL')[min(1, e.code)]) + raise + except BaseException: + logger.critical("exception", exc_info=True) + sys.exit(2) diff --git a/Tests/iaas/mandatory-services/README.md b/Tests/iaas/mandatory-services/README.md new file mode 100644 index 000000000..33a66d7f4 --- /dev/null +++ b/Tests/iaas/mandatory-services/README.md @@ -0,0 +1,66 @@ +# Mandatory IaaS Service APIs Test Suite + +## Test Environment Setup + +### Test Execution Environment + +> **NOTE:** The test execution procedure does not require cloud admin rights. + +To execute the test suite a valid cloud configuration for the OpenStack SDK in the shape of "`clouds.yaml`" is mandatory[^1]. +**The file is expected to be located in the current working directory where the test script is executed unless configured otherwise.** + +[^1]: [OpenStack Documentation: Configuring OpenStack SDK Applications](https://docs.openstack.org/openstacksdk/latest/user/config/configuration.html) + +The test execution environment can be located on any system outside of the cloud infrastructure that has OpenStack API access. +Make sure that the API access is configured properly in "`clouds.yaml`". + +It is recommended to use a Python virtual environment[^2]. +Next, install the OpenStack SDK and boto3 required by the test suite: + +```bash +pip3 install openstacksdk +pip3 install boto3 +``` + +Within this environment execute the test suite. + +[^2]: [Python 3 Documentation: Virtual Environments and Packages](https://docs.python.org/3/tutorial/venv.html) + +## Test Execution + +The test suite is executed as follows: + +```bash +python3 mandatory-iaas-services.py --os-cloud mycloud +``` + +As an alternative to "`--os-cloud`", the "`OS_CLOUD`" environment variable may be specified instead. +The parameter is used to look up the correct cloud configuration in "`clouds.yaml`". +For the example command above, this file should contain a `clouds.mycloud` section like this: + +```yaml +--- +clouds: + mycloud: + auth: + auth_url: ... + ... + ... +``` + +If the deployment uses s3 only and does not have the object store endpoint specified in the service catalog, the "`--s3-endpoint`" flag may be used to specify the s3 endpoint. +In that case the "`--s3-access`" and "`--s3-access-secret`" flags must also be set, to give all necessery credentials to the test suite: + +```bash +python3 mandatory-iaas-services3.py --os-cloud mycloud2 --s3-endpoint "http://s3-endpoint:9000" --s3-access test-user --s3-access-secret test-user-secret +``` + +For any further options consult the output of "`python3 volume-backup-tester.py --help`". + +### Script Behavior & Test Results + +If all tests pass, the script will return with an exit code of `0`. + +If any test fails, the script will halt, print the exact error to `stderr` and return with a non-zero exit code. + +There is no cleanup done by this test as it mainly only inspect the service catalog and only for the object store creates a bucket, which is then promptly deleted. 
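+
+For CI integration, the exit code described above can be consumed directly.
+A minimal sketch, assuming a POSIX shell and the hypothetical `mycloud` entry from the example configuration:
+
+```bash
+if python3 mandatory-iaas-services.py --os-cloud mycloud; then
+    echo "mandatory IaaS service APIs: PASS"
+else
+    echo "mandatory IaaS service APIs: FAIL" >&2
+fi
+```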
diff --git a/Tests/iaas/mandatory-services/mandatory-iaas-services.py b/Tests/iaas/mandatory-services/mandatory-iaas-services.py new file mode 100644 index 000000000..ab5cc0a2f --- /dev/null +++ b/Tests/iaas/mandatory-services/mandatory-iaas-services.py @@ -0,0 +1,299 @@ +"""Mandatory APIs checker +This script retrieves the endpoint catalog from Keystone using the OpenStack +SDK and checks whether all mandatory APi endpoints, are present. +The script relies on an OpenStack SDK compatible clouds.yaml file for +authentication with Keystone. +As the s3 endpoint might differ, a missing one will only result in a warning. +""" + +import argparse +import boto3 +from collections import Counter +import logging +import os +import re +import sys +import uuid + +import openstack + + +TESTCONTNAME = "scs-test-container" + +logger = logging.getLogger(__name__) +mandatory_services = ["compute", "identity", "image", "network", + "load-balancer", "placement", "object-store"] +block_storage_service = ["volume", "volumev3", "block-storage"] + + +def connect(cloud_name: str) -> openstack.connection.Connection: + """Create a connection to an OpenStack cloud + :param string cloud_name: + The name of the configuration to load from clouds.yaml. + :returns: openstack.connnection.Connection + """ + return openstack.connect( + cloud=cloud_name, + ) + + +def check_presence_of_mandatory_services(cloud_name: str, s3_credentials=None): + try: + connection = connect(cloud_name) + services = connection.service_catalog + except Exception as e: + print(str(e)) + raise Exception( + f"Connection to cloud '{cloud_name}' was not successfully. " + f"The Catalog endpoint could not be accessed. " + f"Please check your cloud connection and authorization." + ) + + if s3_credentials: + mandatory_services.remove("object-store") + for svc in services: + svc_type = svc['type'] + if svc_type in mandatory_services: + mandatory_services.remove(svc_type) + continue + if svc_type in block_storage_service: + block_storage_service.remove(svc_type) + + bs_service_not_present = 0 + if len(block_storage_service) == 3: + # neither block-storage nor volume nor volumev3 is present + # we must assume, that there is no volume service + logger.error("FAIL: No block-storage (volume) endpoint found.") + mandatory_services.append(block_storage_service[0]) + bs_service_not_present = 1 + if not mandatory_services: + # every mandatory service API had an endpoint + return 0 + bs_service_not_present + else: + # there were multiple mandatory APIs not found + logger.error(f"FAIL: The following endpoints are missing: " + f"{mandatory_services}") + return len(mandatory_services) + bs_service_not_present + + +def list_containers(conn): + "Gets a list of buckets" + return [cont.name for cont in conn.object_store.containers()] + + +def create_container(conn, name): + "Creates a test container" + conn.object_store.create_container(name) + return list_containers(conn) + + +def del_container(conn, name): + "Deletes a test container" + conn.object_store.delete(name) + # return list_containers(conn) + + +def s3_conn(creds, conn=None): + "Return an s3 client conn" + vrfy = True + if conn: + cacert = conn.config.config.get("cacert") + # TODO: Handle self-signed certs (from ca_cert in openstack config) + if cacert: + print("WARNING: Trust all Certificates in S3, " + f"OpenStack uses {cacert}", file=sys.stderr) + vrfy = False + return boto3.resource('s3', aws_access_key_id=creds["AK"], + aws_secret_access_key=creds["SK"], + endpoint_url=creds["HOST"], + verify=vrfy) + + +def 
list_s3_buckets(s3): + "Get a list of s3 buckets" + return [buck.name for buck in s3.buckets.all()] + + +def create_bucket(s3, name): + "Create an s3 bucket" + # bucket = s3.Bucket(name) + # bucket.create() + s3.create_bucket(Bucket=name) + return list_s3_buckets(s3) + + +def del_bucket(s3, name): + "Delete an s3 bucket" + buck = s3.Bucket(name=name) + buck.delete() + # s3.delete_bucket(Bucket=name) + + +def s3_from_env(creds, fieldnm, env, prefix=""): + "Set creds[fieldnm] to os.environ[env] if set" + if env in os.environ: + creds[fieldnm] = prefix + os.environ[env] + if fieldnm not in creds: + print(f"WARNING: s3_creds[{fieldnm}] not set", file=sys.stderr) + + +def s3_from_ostack(creds, conn, endpoint): + "Set creds from openstack swift/keystone" + rgx = re.compile(r"^(https*://[^/]*)/") + match = rgx.match(endpoint) + if match: + creds["HOST"] = match.group(1) + # Use first ec2 cred if one exists + ec2_creds = [cred for cred in conn.identity.credentials() + if cred.type == "ec2"] + if len(ec2_creds): + # FIXME: Assume cloud is not evil + ec2_dict = eval(ec2_creds[0].blob, {"null": None}) + creds["AK"] = ec2_dict["access"] + creds["SK"] = ec2_dict["secret"] + return + # Generate keyid and secret + ak = uuid.uuid4().hex + sk = uuid.uuid4().hex + blob = f'{{"access": "{ak}", "secret": "{sk}"}}' + try: + conn.identity.create_credential(type="ec2", blob=blob, + user_id=conn.current_user_id, + project_id=conn.current_project_id) + creds["AK"] = ak + creds["SK"] = sk + except BaseException as exc: + print(f"WARNING: ec2 creds creation failed: {exc!s}", file=sys.stderr) + # pass + + +def check_for_s3_and_swift(cloud_name: str, s3_credentials=None): + # If we get credentials we assume, that there is no Swift and only test s3 + if s3_credentials: + try: + s3 = s3_conn(s3_credentials) + except Exception as e: + print(str(e)) + logger.error("FAIL: Connection to s3 failed.") + return 1 + s3_buckets = list_s3_buckets(s3) + if not s3_buckets: + s3_buckets = create_bucket(s3, TESTCONTNAME) + assert s3_buckets + if s3_buckets == [TESTCONTNAME]: + del_bucket(s3, TESTCONTNAME) + # everything worked, and we don't need to test for Swift: + print("SUCCESS: S3 exists") + return 0 + # there were no credentials given, so we assume s3 is accessable via + # the service catalog and Swift might exist too + try: + connection = connect(cloud_name) + connection.authorize() + except Exception as e: + print(str(e)) + raise Exception( + f"Connection to cloud '{cloud_name}' was not successfully. " + f"The Catalog endpoint could not be accessed. " + f"Please check your cloud connection and authorization." + ) + s3_creds = {} + try: + endpoint = connection.object_store.get_endpoint() + except Exception as e: + logger.error( + f"FAIL: No object store endpoint found in cloud " + f"'{cloud_name}'. No testing for the s3 service possible. 
" + f"Details: %s", e + ) + return 1 + # Get S3 endpoint (swift) and ec2 creds from OpenStack (keystone) + s3_from_ostack(s3_creds, connection, endpoint) + # Overrides (var names are from libs3, in case you wonder) + s3_from_env(s3_creds, "HOST", "S3_HOSTNAME", "https://") + s3_from_env(s3_creds, "AK", "S3_ACCESS_KEY_ID") + s3_from_env(s3_creds, "SK", "S3_SECRET_ACCESS_KEY") + + s3 = s3_conn(s3_creds, connection) + s3_buckets = list_s3_buckets(s3) + if not s3_buckets: + s3_buckets = create_bucket(s3, TESTCONTNAME) + assert s3_buckets + + # If we got till here, s3 is working, now swift + swift_containers = list_containers(connection) + # if not swift_containers: + # swift_containers = create_container(connection, TESTCONTNAME) + result = 0 + if Counter(s3_buckets) != Counter(swift_containers): + print("WARNING: S3 buckets and Swift Containers differ:\n" + f"S3: {sorted(s3_buckets)}\nSW: {sorted(swift_containers)}") + result = 1 + else: + print("SUCCESS: S3 and Swift exist and agree") + # Clean up + # FIXME: Cleanup created EC2 credential + # if swift_containers == [TESTCONTNAME]: + # del_container(connection, TESTCONTNAME) + # Cleanup created S3 bucket + if s3_buckets == [TESTCONTNAME]: + del_bucket(s3, TESTCONTNAME) + return result + + +def main(): + parser = argparse.ArgumentParser( + description="SCS Mandatory IaaS Service Checker") + parser.add_argument( + "--os-cloud", type=str, + help="Name of the cloud from clouds.yaml, alternative " + "to the OS_CLOUD environment variable" + ) + parser.add_argument( + "--s3-endpoint", type=str, + help="URL to the s3 service." + ) + parser.add_argument( + "--s3-access", type=str, + help="Access Key to connect to the s3 service." + ) + parser.add_argument( + "--s3-access-secret", type=str, + help="Access secret to connect to the s3 service." + ) + parser.add_argument( + "--debug", action="store_true", + help="Enable OpenStack SDK debug logging" + ) + args = parser.parse_args() + openstack.enable_logging(debug=args.debug) + + # parse cloud name for lookup in clouds.yaml + cloud = os.environ.get("OS_CLOUD", None) + if args.os_cloud: + cloud = args.os_cloud + assert cloud, ( + "You need to have the OS_CLOUD environment variable set to your cloud " + "name or pass it via --os-cloud" + ) + + s3_credentials = None + if args.s3_endpoint: + if (not args.s3_access) or (not args.s3_access_secret): + print("WARNING: test for external s3 needs access key and access secret.") + s3_credentials = { + "AK": args.s3_access, + "SK": args.s3_access_secret, + "HOST": args.s3_endpoint + } + elif args.s3_access or args.s3_access_secret: + print("WARNING: access to s3 was given, but no endpoint provided.") + + result = check_presence_of_mandatory_services(cloud, s3_credentials) + result = result + check_for_s3_and_swift(cloud, s3_credentials) + + return result + + +if __name__ == "__main__": + main() diff --git a/Tests/iaas/security-groups/default-security-group-rules.py b/Tests/iaas/security-groups/default-security-group-rules.py old mode 100644 new mode 100755 index 773cf0bb8..def511956 --- a/Tests/iaas/security-groups/default-security-group-rules.py +++ b/Tests/iaas/security-groups/default-security-group-rules.py @@ -1,130 +1,181 @@ +#!/usr/bin/env python3 """Default Security Group Rules Checker This script tests the absence of any ingress default security group rule except for ingress rules from the same Security Group. Furthermore the presence of default rules for egress traffic is checked. 
""" +import argparse +from collections import Counter +import logging +import os +import sys import openstack -import os -import argparse +from openstack.exceptions import ResourceNotFound +logger = logging.getLogger(__name__) -def connect(cloud_name: str) -> openstack.connection.Connection: - """Create a connection to an OpenStack cloud +SG_NAME = "scs-test-default-sg" +DESCRIPTION = "scs-test-default-sg" - :param string cloud_name: - The name of the configuration to load from clouds.yaml. - :returns: openstack.connnection.Connection +def check_default_rules(rules, short=False): """ - return openstack.connect( - cloud=cloud_name, - ) + counts all verall ingress rules and egress rules, depending on the requested testing mode - -def test_rules(cloud_name: str): - try: - connection = connect(cloud_name) - rules = connection.network.default_security_group_rules() - except Exception as e: - print(str(e)) - raise Exception( - f"Connection to cloud '{cloud_name}' was not successfully. " - f"The default Security Group Rules could not be accessed. " - f"Please check your cloud connection and authorization." - ) - - # count all overall ingress rules and egress rules. - ingress_rules = 0 - ingress_from_same_sg = 0 - egress_rules = 0 - egress_ipv4_default_sg = 0 - egress_ipv4_custom_sg = 0 - egress_ipv6_default_sg = 0 - egress_ipv6_custom_sg = 0 + :param bool short + if short is True, the testing mode is set on short for older OpenStack versions + """ + ingress_rules = egress_rules = 0 + egress_vars = {'IPv4': {}, 'IPv6': {}} + for key, value in egress_vars.items(): + value['default'] = 0 + if not short: + value['custom'] = 0 if not rules: - print("No default security group rules defined.") - else: - for rule in rules: - direction = rule.direction - ethertype = rule.ethertype - r_custom_sg = rule.used_in_non_default_sg - r_default_sg = rule.used_in_default_sg - if direction == "ingress": - ingress_rules += 1 + logger.info("No default security group rules defined.") + for rule in rules: + direction = rule["direction"] + ethertype = rule["ethertype"] + if direction == "ingress": + if not short: # we allow ingress from the same security group # but only for the default security group - r_group_id = rule.remote_group_id - if (r_group_id == "PARENT" and not r_custom_sg): - ingress_from_same_sg += 1 - elif direction == "egress" and ethertype == "IPv4": - egress_rules += 1 - if rule.remote_ip_prefix: - # this rule does not allow traffic to all external ips - continue - if r_custom_sg: - egress_ipv4_custom_sg += 1 - if r_default_sg: - egress_ipv4_default_sg += 1 - elif direction == "egress" and ethertype == "IPv6": - egress_rules += 1 - if rule.remote_ip_prefix: - # this rule does not allow traffic to all external ips + if rule.remote_group_id == "PARENT" and not rule["used_in_non_default_sg"]: continue - if r_custom_sg: - egress_ipv6_custom_sg += 1 - if r_default_sg: - egress_ipv6_default_sg += 1 - - # test whether there are no other than the allowed ingress rules - assert ingress_rules == ingress_from_same_sg, ( - f"Expected only ingress rules for default security groups, " - f"that allow ingress traffic from the same group. " - f"But there are more - in total {ingress_rules} ingress rules. 
" - f"There should be only {ingress_from_same_sg} ingress rules.") - assert egress_rules > 0, ( - f"Expected to have more than {egress_rules} egress rules present.") - var_list = [egress_ipv4_default_sg, egress_ipv4_custom_sg, - egress_ipv6_default_sg, egress_ipv6_custom_sg] - assert all([var > 0 for var in var_list]), ( - "Not all expected egress rules are present. " - "Expected rules for egress for IPv4 and IPv6 " - "both for default and custom security groups.") - - result_dict = { - "Ingress Rules": ingress_rules, - "Egress Rules": egress_rules - } - return result_dict + ingress_rules += 1 + elif direction == "egress" and ethertype in egress_vars: + egress_rules += 1 + if short: + egress_vars[ethertype]['default'] += 1 + continue + if rule.remote_ip_prefix: + # this rule does not allow traffic to all external ips + continue + # note: these two are not mutually exclusive + if rule["used_in_default_sg"]: + egress_vars[ethertype]['default'] += 1 + if rule["used_in_non_default_sg"]: + egress_vars[ethertype]['custom'] += 1 + # test whether there are no unallowed ingress rules + if ingress_rules: + logger.error(f"Expected no default ingress rules, found {ingress_rules}.") + # test whether all expected egress rules are present + missing = [(key, key2) for key, val in egress_vars.items() for key2, val2 in val.items() if not val2] + if missing: + logger.error( + "Expected rules for egress for IPv4 and IPv6 both for default and custom security groups. " + f"Missing rule types: {', '.join(str(x) for x in missing)}" + ) + logger.info(str({ + "Unallowed Ingress Rules": ingress_rules, + "Egress Rules": egress_rules, + })) + + +def create_security_group(conn, sg_name: str = SG_NAME, description: str = DESCRIPTION): + """Create security group in openstack + + :returns: + ~openstack.network.v2.security_group.SecurityGroup: The new security group or None + """ + sg = conn.network.create_security_group(name=sg_name, description=description) + return sg.id + + +def delete_security_group(conn, sg_id): + conn.network.delete_security_group(sg_id) + # in case of a successful delete finding the sg will throw an exception + try: + conn.network.find_security_group(name_or_id=sg_id) + except ResourceNotFound: + logger.debug(f"Security group {sg_id} was deleted successfully.") + except Exception: + logger.critical(f"Security group {sg_id} was not deleted successfully") + raise + + +def altern_test_rules(connection: openstack.connection.Connection): + sg_id = create_security_group(connection) + try: + sg = connection.network.find_security_group(name_or_id=sg_id) + check_default_rules(sg.security_group_rules, short=True) + finally: + delete_security_group(connection, sg_id) + + +def test_rules(connection: openstack.connection.Connection): + try: + rules = list(connection.network.default_security_group_rules()) + except ResourceNotFound: + logger.info( + "API call failed. OpenStack components might not be up to date. " + "Falling back to old-style test method. 
" + ) + logger.debug("traceback", exc_info=True) + altern_test_rules(connection) + else: + check_default_rules(rules) + + +class CountingHandler(logging.Handler): + def __init__(self, level=logging.NOTSET): + super().__init__(level=level) + self.bylevel = Counter() + + def handle(self, record): + self.bylevel[record.levelno] += 1 def main(): parser = argparse.ArgumentParser( - description="SCS Default Security Group Rules Checker") + description="SCS Default Security Group Rules Checker", + ) parser.add_argument( - "--os-cloud", type=str, + "--os-cloud", + type=str, help="Name of the cloud from clouds.yaml, alternative " - "to the OS_CLOUD environment variable" + "to the OS_CLOUD environment variable", ) parser.add_argument( - "--debug", action="store_true", - help="Enable OpenStack SDK debug logging" + "--debug", action="store_true", help="Enable debug logging", ) args = parser.parse_args() openstack.enable_logging(debug=args.debug) + logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.DEBUG if args.debug else logging.INFO, + ) + + # count the number of log records per level (used for summary and return code) + counting_handler = CountingHandler(level=logging.INFO) + logger.addHandler(counting_handler) # parse cloud name for lookup in clouds.yaml - cloud = os.environ.get("OS_CLOUD", None) - if args.os_cloud: - cloud = args.os_cloud - assert cloud, ( - "You need to have the OS_CLOUD environment variable set to your cloud " - "name or pass it via --os-cloud" - ) + cloud = args.os_cloud or os.environ.get("OS_CLOUD", None) + if not cloud: + raise ValueError( + "You need to have the OS_CLOUD environment variable set to your cloud " + "name or pass it via --os-cloud" + ) - print(test_rules(cloud)) + with openstack.connect(cloud) as conn: + test_rules(conn) + + c = counting_handler.bylevel + logger.debug(f"Total critical / error / warning: {c[logging.CRITICAL]} / {c[logging.ERROR]} / {c[logging.WARNING]}") + if not c[logging.CRITICAL]: + print("security-groups-default-rules-check: " + ('PASS', 'FAIL')[min(1, c[logging.ERROR])]) + return min(127, c[logging.CRITICAL] + c[logging.ERROR]) # cap at 127 due to OS restrictions if __name__ == "__main__": - main() + try: + sys.exit(main()) + except SystemExit: + raise + except BaseException as exc: + logging.debug("traceback", exc_info=True) + logging.critical(str(exc)) + sys.exit(1) diff --git a/Tests/iaas/standard-images/images-openstack.py b/Tests/iaas/standard-images/images-openstack.py index 07a810b9b..6f192e5d0 100755 --- a/Tests/iaas/standard-images/images-openstack.py +++ b/Tests/iaas/standard-images/images-openstack.py @@ -40,6 +40,8 @@ def print_usage(file=sys.stderr): Options: [-c/--os-cloud OS_CLOUD] sets cloud environment (default from OS_CLOUD env) [-d/--debug] enables DEBUG logging channel + [-V/--image-visibility VIS_LIST] filters images by visibility + (default: 'public,community'; use '*' to disable) """, end='', file=file) @@ -61,7 +63,7 @@ def main(argv): logger.addHandler(counting_handler) try: - opts, args = getopt.gnu_getopt(argv, "c:hd", ["os-cloud=", "help", "debug"]) + opts, args = getopt.gnu_getopt(argv, "c:hdV:", ["os-cloud=", "help", "debug", "image-visibility="]) except getopt.GetoptError as exc: logger.critical(f"{exc}") print_usage() @@ -74,6 +76,7 @@ def main(argv): yaml_path = args[0] cloud = os.environ.get("OS_CLOUD") + image_visibility = set() for opt in opts: if opt[0] == "-h" or opt[0] == "--help": print_usage() @@ -82,11 +85,16 @@ def main(argv): cloud = opt[1] if opt[0] == "-d" or opt[0] == 
"--debug": logging.getLogger().setLevel(logging.DEBUG) + if opt[0] == "-V" or opt[0] == "--image-visibility": + image_visibility.update([v.strip() for v in opt[1].split(',')]) if not cloud: logger.critical("You need to have OS_CLOUD set or pass --os-cloud=CLOUD.") return 1 + if not image_visibility: + image_visibility.update(("public", "community")) + # we only support local files; but we allow specifying the following URLs for the sake of # better documentation prefix = next(p for p in ( @@ -113,11 +121,15 @@ def main(argv): logger.debug(f"Fetching image list from cloud '{cloud}'") with openstack.connect(cloud=cloud, timeout=32) as conn: present_images = conn.list_images(show_all=True) - by_name = { - image.name: image - for image in present_images - } - logger.debug(f"Images present: {', '.join(sorted(by_name))}") + if '*' not in image_visibility: + logger.debug(f"Images: filter for visibility {', '.join(sorted(image_visibility))}") + present_images = [img for img in present_images if img.visibility in image_visibility] + all_image_names = [f"{img.name} ({img.visibility})" for img in present_images] + logger.debug(f"Images: {', '.join(all_image_names) or '(NONE)'}") + by_name = { + image.name: image + for image in present_images + } logger.debug(f"Checking {len(image_specs)} image specs against {len(present_images)} images") for image_spec in image_specs: diff --git a/Tests/iaas/volume-backup/volume-backup-tester.py b/Tests/iaas/volume-backup/volume-backup-tester.py old mode 100644 new mode 100755 index f4fa9522d..bcbb89664 --- a/Tests/iaas/volume-backup/volume-backup-tester.py +++ b/Tests/iaas/volume-backup/volume-backup-tester.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Volume Backup API tester for Block Storage API This test script executes basic operations on the Block Storage API centered @@ -14,7 +15,9 @@ import argparse import getpass +import logging import os +import sys import time import typing @@ -29,28 +32,23 @@ WAIT_TIMEOUT = 60 -def connect(cloud_name: str, password: typing.Optional[str] = None - ) -> openstack.connection.Connection: - """Create a connection to an OpenStack cloud - - :param string cloud_name: - The name of the configuration to load from clouds.yaml. - - :param string password: - Optional password override for the connection. 
- - :returns: openstack.connnection.Connection - """ - - if password: - return openstack.connect( - cloud=cloud_name, - password=password - ) - else: - return openstack.connect( - cloud=cloud_name, - ) +def wait_for_resource( + get_func: typing.Callable[[str], openstack.resource.Resource], + resource_id: str, + expected_status=("available", ), + timeout=WAIT_TIMEOUT, +) -> None: + seconds_waited = 0 + resource = get_func(resource_id) + while resource is None or resource.status not in expected_status: + time.sleep(1.0) + seconds_waited += 1 + if seconds_waited >= timeout: + raise RuntimeError( + f"Timed out after {seconds_waited} s: waiting for resource {resource_id} " + f"to be in status {expected_status} (current: {resource and resource.status})" + ) + resource = get_func(resource_id) def test_backup(conn: openstack.connection.Connection, @@ -64,133 +62,90 @@ def test_backup(conn: openstack.connection.Connection, """ # CREATE VOLUME - print("Creating volume ...") - volume = conn.block_storage.create_volume( - name=f"{prefix}volume", - size=1 - ) - assert volume is not None, ( - "Initial volume creation failed" - ) + volume_name = f"{prefix}volume" + logging.info(f"Creating volume '{volume_name}' ...") + volume = conn.block_storage.create_volume(name=volume_name, size=1) + if volume is None: + raise RuntimeError(f"Creation of initial volume '{volume_name}' failed") volume_id = volume.id - assert conn.block_storage.get_volume(volume_id) is not None, ( - "Retrieving initial volume by ID failed" - ) + if conn.block_storage.get_volume(volume_id) is None: + raise RuntimeError(f"Retrieving initial volume by ID '{volume_id}' failed") - print( + logging.info( f"↳ waiting for volume with ID '{volume_id}' to reach status " f"'available' ..." ) - seconds_waited = 0 - while conn.block_storage.get_volume(volume_id).status != "available": - time.sleep(1.0) - seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for volume to reach status " - f"'available' (volume id: {volume_id}) after {seconds_waited} " - f"seconds" - ) - print("Create empty volume: PASS") + wait_for_resource(conn.block_storage.get_volume, volume_id, timeout=timeout) + logging.info("Create empty volume: PASS") # CREATE BACKUP - print("Creating backup from volume ...") - backup = conn.block_storage.create_backup( - name=f"{prefix}volume-backup", - volume_id=volume_id - ) - assert backup is not None, ( - "Backup creation failed" - ) + logging.info("Creating backup from volume ...") + backup = conn.block_storage.create_backup(name=f"{prefix}volume-backup", volume_id=volume_id) + if backup is None: + raise RuntimeError("Backup creation failed") backup_id = backup.id - assert conn.block_storage.get_backup(backup_id) is not None, ( - "Retrieving backup by ID failed" - ) + if conn.block_storage.get_backup(backup_id) is None: + raise RuntimeError("Retrieving backup by ID failed") - print(f"↳ waiting for backup '{backup_id}' to become available ...") - seconds_waited = 0 - while conn.block_storage.get_backup(backup_id).status != "available": - time.sleep(1.0) - seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for backup to reach status " - f"'available' (backup id: {backup_id}) after {seconds_waited} " - f"seconds" - ) - print("Create backup from volume: PASS") + logging.info(f"↳ waiting for backup '{backup_id}' to become available ...") + wait_for_resource(conn.block_storage.get_backup, backup_id, timeout=timeout) + logging.info("Create backup from volume: PASS") # 
RESTORE BACKUP - print("Restoring backup to volume ...") restored_volume_name = f"{prefix}restored-backup" - conn.block_storage.restore_backup( - backup_id, - name=restored_volume_name - ) + logging.info(f"Restoring backup to volume '{restored_volume_name}' ...") + conn.block_storage.restore_backup(backup_id, name=restored_volume_name) - print( + logging.info( f"↳ waiting for restoration target volume '{restored_volume_name}' " f"to be created ..." ) - seconds_waited = 0 - while conn.block_storage.find_volume(restored_volume_name) is None: - time.sleep(1.0) - seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for restored volume to be created " - f"(volume name: {restored_volume_name}) after {seconds_waited} " - f"seconds" - ) + wait_for_resource(conn.block_storage.find_volume, restored_volume_name, timeout=timeout) # wait for the volume restoration to finish - print( + logging.info( f"↳ waiting for restoration target volume '{restored_volume_name}' " f"to reach 'available' status ..." ) volume_id = conn.block_storage.find_volume(restored_volume_name).id - while conn.block_storage.get_volume(volume_id).status != "available": - time.sleep(1.0) - seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for restored volume reach status " - f"'available' (volume id: {volume_id}) after {seconds_waited} " - f"seconds" - ) - print("Restore volume from backup: PASS") + wait_for_resource(conn.block_storage.get_volume, volume_id, timeout=timeout) + logging.info("Restore volume from backup: PASS") def cleanup(conn: openstack.connection.Connection, prefix=DEFAULT_PREFIX, - timeout=WAIT_TIMEOUT): + timeout=WAIT_TIMEOUT) -> bool: """ Looks up volume and volume backup resources matching the given prefix and deletes them. + Returns False if there were any errors during cleanup which might leave + resources behind. Otherwise returns True to indicate cleanup success. 
""" - def wait_for_resource(resource_type: str, resource_id: str, - expected_status="available") -> None: - seconds_waited = 0 - get_func = getattr(conn.block_storage, f"get_{resource_type}") - while get_func(resource_id).status != expected_status: - time.sleep(1.0) - seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for {resource_type} during " - f"cleanup to be in status '{expected_status}' " - f"({resource_type} id: {resource_id}) after {seconds_waited} " - f"seconds" - ) - - print(f"\nPerforming cleanup for resources with the " - f"'{prefix}' prefix ...") + logging.info(f"Performing cleanup for resources with the '{prefix}' prefix ...") + cleanup_issues = 0 # count failed cleanup operations backups = conn.block_storage.backups() for backup in backups: - if backup.name.startswith(prefix): - try: - wait_for_resource("backup", backup.id) - except openstack.exceptions.ResourceNotFound: - # if the resource has vanished on - # its own in the meantime ignore it - continue - print(f"↳ deleting volume backup '{backup.id}' ...") + if not backup.name.startswith(prefix): + continue + try: + # we can only delete if status is available or error, so try and wait + wait_for_resource( + conn.block_storage.get_backup, + backup.id, + expected_status=("available", "error"), + timeout=timeout, + ) + logging.info(f"↳ deleting volume backup '{backup.id}' ...") conn.block_storage.delete_backup(backup.id) + except openstack.exceptions.ResourceNotFound: + # if the resource has vanished on its own in the meantime ignore it + continue + except Exception as e: + # Most common exception would be a timeout in wait_for_resource. + # We do not need to increment cleanup_issues here since + # any remaining ones will be caught in the next loop down below anyway. + logging.debug("traceback", exc_info=True) + logging.warning(str(e)) # wait for all backups to be cleaned up before attempting to remove volumes seconds_waited = 0 @@ -200,22 +155,42 @@ def wait_for_resource(resource_type: str, resource_id: str, ) > 0: time.sleep(1.0) seconds_waited += 1 - assert seconds_waited < timeout, ( - f"Timeout reached while waiting for all backups with prefix " - f"'{prefix}' to finish deletion" - ) + if seconds_waited >= timeout: + cleanup_issues += 1 + logging.warning( + f"Timeout reached while waiting for all backups with prefix " + f"'{prefix}' to finish deletion during cleanup after " + f"{seconds_waited} seconds" + ) + break volumes = conn.block_storage.volumes() for volume in volumes: - if volume.name.startswith(prefix): - try: - wait_for_resource("volume", volume.id) - except openstack.exceptions.ResourceNotFound: - # if the resource has vanished on - # its own in the meantime ignore it - continue - print(f"↳ deleting volume '{volume.id}' ...") + if not volume.name.startswith(prefix): + continue + try: + wait_for_resource( + conn.block_storage.get_volume, + volume.id, + expected_status=("available", "error"), + timeout=timeout, + ) + logging.info(f"↳ deleting volume '{volume.id}' ...") conn.block_storage.delete_volume(volume.id) + except openstack.exceptions.ResourceNotFound: + # if the resource has vanished on its own in the meantime ignore it + continue + except Exception as e: + logging.debug("traceback", exc_info=True) + logging.warning(str(e)) + cleanup_issues += 1 + + if cleanup_issues: + logging.info( + f"Some resources with the '{prefix}' prefix were not cleaned up!" 
+ ) + + return not cleanup_issues def main(): @@ -257,26 +232,43 @@ def main(): ) args = parser.parse_args() openstack.enable_logging(debug=args.debug) + logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.DEBUG if args.debug else logging.INFO, + ) # parse cloud name for lookup in clouds.yaml - cloud = os.environ.get("OS_CLOUD", None) - if args.os_cloud: - cloud = args.os_cloud - assert cloud, ( - "You need to have the OS_CLOUD environment variable set to your " - "cloud name or pass it via --os-cloud" - ) - conn = connect( - cloud, - password=getpass.getpass("Enter password: ") if args.ask else None - ) - if args.cleanup_only: - cleanup(conn, prefix=args.prefix, timeout=args.timeout) - else: - cleanup(conn, prefix=args.prefix, timeout=args.timeout) - test_backup(conn, prefix=args.prefix, timeout=args.timeout) - cleanup(conn, prefix=args.prefix, timeout=args.timeout) + cloud = args.os_cloud or os.environ.get("OS_CLOUD", None) + if not cloud: + raise Exception( + "You need to have the OS_CLOUD environment variable set to your " + "cloud name or pass it via --os-cloud" + ) + password = getpass.getpass("Enter password: ") if args.ask else None + + with openstack.connect(cloud, password=password) as conn: + if not cleanup(conn, prefix=args.prefix, timeout=args.timeout): + raise RuntimeError("Initial cleanup failed") + if args.cleanup_only: + logging.info("Cleanup-only run finished.") + return + try: + test_backup(conn, prefix=args.prefix, timeout=args.timeout) + except BaseException: + print('volume-backup-check: FAIL') + raise + else: + print('volume-backup-check: PASS') + finally: + cleanup(conn, prefix=args.prefix, timeout=args.timeout) if __name__ == "__main__": - main() + try: + sys.exit(main()) + except SystemExit: + raise + except BaseException as exc: + logging.debug("traceback", exc_info=True) + logging.critical(str(exc)) + sys.exit(1) diff --git a/Tests/iaas/volume-types/volume-types-check.py b/Tests/iaas/volume-types/volume-types-check.py old mode 100644 new mode 100755 index 444755816..4b1945fb8 --- a/Tests/iaas/volume-types/volume-types-check.py +++ b/Tests/iaas/volume-types/volume-types-check.py @@ -141,6 +141,8 @@ def main(argv): "Total critical / error / warning: " f"{c[logging.CRITICAL]} / {c[logging.ERROR]} / {c[logging.WARNING]}" ) + if not c[logging.CRITICAL]: + print("volume-types-check: " + ('PASS', 'FAIL')[min(1, c[logging.ERROR])]) return min(127, c[logging.CRITICAL] + c[logging.ERROR]) # cap at 127 due to OS restrictions diff --git a/Tests/iam/domain-manager/domain-manager-check.py b/Tests/iam/domain-manager/domain-manager-check.py old mode 100644 new mode 100755 index e56aad884..41040122b --- a/Tests/iam/domain-manager/domain-manager-check.py +++ b/Tests/iam/domain-manager/domain-manager-check.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Domain Manager policy configuration checker This script uses the OpenStack SDK to validate the proper implementation diff --git a/Tests/kaas/k8s-version-policy/k8s-eol-data.yml b/Tests/kaas/k8s-version-policy/k8s-eol-data.yml index 6a549c464..3a3d3b2eb 100644 --- a/Tests/kaas/k8s-version-policy/k8s-eol-data.yml +++ b/Tests/kaas/k8s-version-policy/k8s-eol-data.yml @@ -1,5 +1,7 @@ # https://kubernetes.io/releases/patch-releases/#detailed-release-history-for-active-branches +- branch: '1.31' + end-of-life: '2025-10-28' - branch: '1.30' end-of-life: '2025-06-28' - branch: '1.29' diff --git a/Tests/kaas/k8s-version-policy/k8s_version_policy.py b/Tests/kaas/k8s-version-policy/k8s_version_policy.py index 
a7884db28..cef272acd 100755 --- a/Tests/kaas/k8s-version-policy/k8s_version_policy.py +++ b/Tests/kaas/k8s-version-policy/k8s_version_policy.py @@ -45,7 +45,7 @@ MINOR_VERSION_CADENCE = timedelta(days=120) -PATCH_VERSION_CADENCE = timedelta(weeks=1) +PATCH_VERSION_CADENCE = timedelta(weeks=2) CVE_VERSION_CADENCE = timedelta(days=2) CVE_SEVERITY = 8 # CRITICAL diff --git a/Tests/kaas/k8s-version-policy/k8s_version_policy_test.py b/Tests/kaas/k8s-version-policy/k8s_version_policy_test.py index dd65ceb50..dcef89a1b 100644 --- a/Tests/kaas/k8s-version-policy/k8s_version_policy_test.py +++ b/Tests/kaas/k8s-version-policy/k8s_version_policy_test.py @@ -44,8 +44,8 @@ def release_data(): K8S_VERSION = K8sVersion(1, 28, 5) EXPECTED_RECENCIES = { datetime(2024, 1, 17): True, - datetime(2024, 1, 24): True, - datetime(2024, 1, 25): False, + datetime(2024, 1, 31): True, + datetime(2024, 2, 1): False, } diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Dockerfile b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Dockerfile deleted file mode 100644 index 5c75f80d3..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM golang:1.23 - -# Install kubectl -# Note: Latest version may be found on: -# https://aur.archlinux.org/packages/kubectl-bin/ -RUN wget https://storage.googleapis.com/kubernetes-release/release/v1.21.3/bin/linux/amd64/kubectl -O /usr/bin/kubectl && \ - chmod +x /usr/bin/kubectl && \ - apt-get update && \ - apt-get install -y jq - -COPY ./scs_k8s_tests /src/scs_k8s_tests -WORKDIR /src -COPY go.* /src/ -ENV CGO_ENABLED=0 -RUN go mod download - -#see: https://docs.docker.com/build/guide/mounts/ -RUN --mount=type=cache,target=/root/.cache/go-build \ - go test -c -o custom.test ./... 
- -CMD ["bash", "-c", "go tool test2json ./custom.test -test.v"] - - diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Makefile b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Makefile deleted file mode 100644 index 2202e9c2f..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/Makefile +++ /dev/null @@ -1,114 +0,0 @@ -# Makefile -# This makefile is for development purpose - -SHELL = /bin/bash -#SED ?= sed - -DOCKERFILE="Dockerfile" -IMAGE_REGISTRY="ghcr.io/sovereigncloudstack/standards" -IMAGE_NAME="scsconformance" -#IMAGE_VERSION_TAG ="v0.1.2" - -KIND_CLUSTER ="testcluster" - -#PLUGIN_NAME="k8s-default-storage-class-plugin-go" -PLUGIN_NAME="plugin" -PLUGIN_FILE="${PLUGIN_NAME}.yaml" - -#~ SONO_WAIT = 1 -#~ SONO_TIMEOUT = 60 - -KUBERNETES_SERVICE_HOST=127.0.0.1 -KUBERNETES_SERVICE_PORT=34743 - - -############################################################################### -## Helpers: ## -############################################################################### - -ifeq ($(IMAGE_VERSION_TAG),) - export TAG=dev -else - export TAG=${IMAGE_VERSION_TAG} -endif - -SONOBUOY_IMAGE = "${IMAGE_REGISTRY}/${IMAGE_NAME}:${TAG}" - -container-init: - @echo "" - @echo "[ContainerImageName] ${SONOBUOY_IMAGE}" - @echo "[SonobuoyPluginFile] ${PLUGIN_FILE}" - @echo "" - - -kind-init: - @echo "" - @echo "[KindCluster] ${KIND_CLUSTER}" - @echo "" - - -############################################################################### -## For develpoment usage: ## -############################################################################### - -dev-prerequests: - @echo "[check-test-setup]" - @kind version - @docker version - @sonobuoy version --short - @go version - - -dev-setup: kind-init - kind create cluster --name ${KIND_CLUSTER} - - -dev-build: container-init - @echo "[build]" - DOCKER_BUILDKIT=1 docker build . -f ${DOCKERFILE} -t ${SONOBUOY_IMAGE} - kind load docker-image --name ${KIND_CLUSTER} ${SONOBUOY_IMAGE} - - -dev-go: - @echo "[go]" - @echo "[KubernetesService] ${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}" - @rm -rf ./build || true - @mkdir ./build - go test -c -o ./build ./... -# go test -c -o ./build ./... 
--args --skip-labels="type=pod-list" -# go tool test2json ./build -test.v - - -dev-run: - @echo "[run-test]" - @echo "sonobuoy run --plugin ${PLUGIN_FILE} --wait=${SONO_WAIT} --timeout=${SONO_TIMEOUT}" -#~ @sonobuoy run --plugin ${PLUGIN_FILE} --wait=${SONO_WAIT} --timeout=${SONO_TIMEOUT} - @sonobuoy run --plugin ${PLUGIN_FILE} - @sonobuoy status - - -dev-result: - @echo "[result]" - #outfile=$(sonobuoy retrieve) && mkdir results && tar -xf ${outfile} -C results - sonobuoy retrieve - sonobuoy results *.tar.gz - mkdir results - tar -xf *.tar.gz -C results - - -dev-clean: - @echo "[clean]" - @sonobuoy delete --all --wait || true - @sonobuoy status || true - @rm -rf *.tar.gz || true - @rm -rf results || true - - - -dev-purge: kind-init dev-clean - @echo "[purge]" - kind delete cluster --name ${KIND_CLUSTER} || true - docker rmi ${SONOBUOY_IMAGE} || true - - -PHONY: dev-prerequests dev-build dev-run dev-result dev-clean dev-clean dev-purge diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go deleted file mode 100644 index 95b2e0482..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go +++ /dev/null @@ -1,108 +0,0 @@ -package scs_k8s_tests - -import ( - "context" - "fmt" - "os" - "testing" - - plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" - v1 "k8s.io/api/core/v1" - "sigs.k8s.io/e2e-framework/pkg/env" - "sigs.k8s.io/e2e-framework/pkg/envconf" -) - - -const ( - ProgressReporterCtxKey = "SONOBUOY_PROGRESS_REPORTER" - NamespacePrefixKey = "NS_PREFIX" -) - -var testenv env.Environment - -func TestMain(m *testing.M) { - // Assume we are running in the cluster as a Sonobuoy plugin. - testenv = env.NewInClusterConfig() - - // Specifying a run ID so that multiple runs wouldn't collide. Allow a prefix to be set via env var - // so that a plugin configuration (yaml file) can easily set that without code changes. - nsPrefix := os.Getenv(NamespacePrefixKey) - runID := envconf.RandomName(nsPrefix, 4) - - // Create updateReporter; will also place into context during Setup for use in features. - updateReporter := plugin_helper.NewProgressReporter(0) - - testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { - // Try and create the client; doing it before all the tests allows the tests to assume - // it can be created without error and they can just use config.Client(). - _,err:=config.NewClient() - return context.WithValue(ctx,ProgressReporterCtxKey,updateReporter) ,err - }) - - testenv.BeforeEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { - fmt.Println("BeforeEachTest") - updateReporter.StartTest(t.Name()) - return createNSForTest(ctx, cfg, t, runID) - }) - - testenv.AfterEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { - fmt.Println("AfterEachTest") - updateReporter.StopTest(t.Name(),t.Failed(),t.Skipped(),nil) - return deleteNSForTest(ctx, cfg, t, runID) - }) - - /* - testenv.BeforeEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { - // Note that you can also add logic here for before a feature is tested. There may be - // more than one feature in a test. 
- fmt.Println("BeforeEachFeature") - return ctx, nil - }) - - testenv.AfterEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { - // Note that you can also add logic here for after a feature is tested. There may be - // more than one feature in a test. - fmt.Println("AfterEachFeature") - return ctx, nil - }) - */ - - testenv.Finish( - // Teardown func: delete kind cluster - func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { - fmt.Println("Finished go test suite") - //~ if err := ???; err != nil{ - //~ return ctx, err - //~ } - return ctx, nil - }, - ) - - os.Exit(testenv.Run(m)) -} - -// CreateNSForTest creates a random namespace with the runID as a prefix. It is stored in the context -// so that the deleteNSForTest routine can look it up and delete it. -func createNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { - ns := envconf.RandomName(runID, 10) - ctx = context.WithValue(ctx, nsKey(t), ns) - - t.Logf("Creating namespace %v for test %v", ns, t.Name()) - nsObj := v1.Namespace{} - nsObj.Name = ns - return ctx, cfg.Client().Resources().Create(ctx, &nsObj) -} - -// DeleteNSForTest looks up the namespace corresponding to the given test and deletes it. -func deleteNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { - ns := fmt.Sprint(ctx.Value(nsKey(t))) - t.Logf("Deleting namespace %v for test %v", ns, t.Name()) - - nsObj := v1.Namespace{} - nsObj.Name = ns - return ctx, cfg.Client().Resources().Delete(ctx, &nsObj) -} - -func nsKey(t *testing.T) string { - return "NS-for-%v" + t.Name() -} diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go.template b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go.template deleted file mode 100644 index 0d3f577a0..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/main_test.go.template +++ /dev/null @@ -1,107 +0,0 @@ -/* -Copyright 2021 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package k8s_test_template - -import ( - "context" - "fmt" - "os" - "testing" - - plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" - v1 "k8s.io/api/core/v1" - "sigs.k8s.io/e2e-framework/pkg/env" - "sigs.k8s.io/e2e-framework/pkg/envconf" -) - - -const ( - ProgressReporterCtxKey = "SONOBUOY_PROGRESS_REPORTER" - NamespacePrefixKey = "NS_PREFIX" -) - -var testenv env.Environment - -func TestMain(m *testing.M) { - // Assume we are running in the cluster as a Sonobuoy plugin. - testenv = env.NewInClusterConfig() - - // Specifying a run ID so that multiple runs wouldn't collide. Allow a prefix to be set via env var - // so that a plugin configuration (yaml file) can easily set that without code changes. 
- nsPrefix := os.Getenv(NamespacePrefixKey) - runID := envconf.RandomName(nsPrefix, 4) - - // Create updateReporter; will also place into context during Setup for use in features. - updateReporter := plugin_helper.NewProgressReporter(0) - - testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { - // Try and create the client; doing it before all the tests allows the tests to assume - // it can be created without error and they can just use config.Client(). - _,err:=config.NewClient() - return context.WithValue(ctx,ProgressReporterCtxKey,updateReporter) ,err - }) - - testenv.BeforeEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { - updateReporter.StartTest(t.Name()) - return createNSForTest(ctx, cfg, t, runID) - }) - testenv.AfterEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { - updateReporter.StopTest(t.Name(),t.Failed(),t.Skipped(),nil) - return deleteNSForTest(ctx, cfg, t, runID) - }) - - /* - testenv.BeforeEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { - // Note that you can also add logic here for before a feature is tested. There may be - // more than one feature in a test. - return ctx, nil - }) - testenv.AfterEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { - // Note that you can also add logic here for after a feature is tested. There may be - // more than one feature in a test. - return ctx, nil - }) - */ - - os.Exit(testenv.Run(m)) -} - -// CreateNSForTest creates a random namespace with the runID as a prefix. It is stored in the context -// so that the deleteNSForTest routine can look it up and delete it. -func createNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { - ns := envconf.RandomName(runID, 10) - ctx = context.WithValue(ctx, nsKey(t), ns) - - t.Logf("Creating namespace %v for test %v", ns, t.Name()) - nsObj := v1.Namespace{} - nsObj.Name = ns - return ctx, cfg.Client().Resources().Create(ctx, &nsObj) -} - -// DeleteNSForTest looks up the namespace corresponding to the given test and deletes it. -func deleteNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { - ns := fmt.Sprint(ctx.Value(nsKey(t))) - t.Logf("Deleting namespace %v for test %v", ns, t.Name()) - - nsObj := v1.Namespace{} - nsObj.Name = ns - return ctx, cfg.Client().Resources().Delete(ctx, &nsObj) -} - -func nsKey(t *testing.T) string { - return "NS-for-%v" + t.Name() -} diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0200_example_test.go b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0200_example_test.go deleted file mode 100644 index ee30f453c..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0200_example_test.go +++ /dev/null @@ -1,83 +0,0 @@ -/* - Copyright 2021 The Kubernetes Authors. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -package scs_k8s_tests - -import ( - "context" - "testing" - "time" - "fmt" - plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/e2e-framework/pkg/envconf" - "sigs.k8s.io/e2e-framework/pkg/features" -) - - -func Test_scs_0200_sonobuoy_pass(t *testing.T) { - fmt.Println("Test a passing test") - testvar := 5 - if testvar != 5 { - t.Errorf("testvar = %d; want 5", testvar) - } -} - -func Test_scs_0200_sonobuoy_fail(t *testing.T) { - fmt.Println("Test a failing test") - testvar := 5 - if testvar != 3 { - t.Errorf("testvar = %d; want 3", testvar) - } -} - -func Test_scs_0200_sonobuoy_TestListPods(t *testing.T) { - f := features.New("pod list").WithLabel("type", "pod-count").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - t.Logf("found %d pods", len(pods.Items)) - if len(pods.Items) == 0 { - t.Fatal("no pods in namespace kube-system") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} - -func Test_scs_0200_sonobuoy_TestListPods_Long(t *testing.T) { - f := features.New("pod list").WithLabel("type", "progress").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - progressReporterVal := ctx.Value(ProgressReporterCtxKey) - progressReporter:=progressReporterVal.(plugin_helper.ProgressReporter) - for i:=0;i<5;i++{ - time.Sleep(5*time.Second) - progressReporter.SendMessageAsync("Waiting for a long test...") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} - diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0201_example_test.go b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0201_example_test.go deleted file mode 100644 index 1771f8058..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/scs_0201_example_test.go +++ /dev/null @@ -1,80 +0,0 @@ -package scs_k8s_tests - -import ( - "context" - "testing" - "time" - "fmt" - plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/e2e-framework/pkg/envconf" - "sigs.k8s.io/e2e-framework/pkg/features" -) - - -func Test_scs_0201_TestDummyIn(t *testing.T) { - fmt.Println("DEBUG: dummy test") - testvar := 5 - if testvar != 3 { - t.Errorf("testvar = %d; want 3", testvar) - } -} - -func Test_scs_0201_TestListPods(t *testing.T) { - f := features.New("pod list").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - t.Logf("found %d pods", len(pods.Items)) - if len(pods.Items) == 0 { - t.Fatal("no pods in namespace kube-system") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} - -func Test_scs_0201_TestListPodsFailing(t *testing.T) { - f := features.New("pod list").Assess( - "pods from kube-test-a", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := 
cfg.Client().Resources("kube-test-a").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - t.Logf("found %d pods", len(pods.Items)) - if len(pods.Items) == 0 { - t.Fatal("no pods in namespace kube-test-a") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} - -func Test_scs_0201_TestLongTest(t *testing.T) { - f := features.New("pod list").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - progressReporterVal := ctx.Value(ProgressReporterCtxKey) - progressReporter:=progressReporterVal.(plugin_helper.ProgressReporter) - for i:=0;i<5;i++{ - time.Sleep(5*time.Second) - progressReporter.SendMessageAsync("Waiting for a long test...") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/template_test.go.template b/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/template_test.go.template deleted file mode 100644 index bf5476938..000000000 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/scs_k8s_tests/template_test.go.template +++ /dev/null @@ -1,66 +0,0 @@ -/* - Copyright 2021 The Kubernetes Authors. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package k8s_test_template - -import ( - "context" - "testing" - "time" - - plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/e2e-framework/pkg/envconf" - "sigs.k8s.io/e2e-framework/pkg/features" -) - - -func TestListPods(t *testing.T) { - f := features.New("pod list").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - t.Logf("found %d pods", len(pods.Items)) - if len(pods.Items) == 0 { - t.Fatal("no pods in namespace kube-system") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} - -func TestLongTest(t *testing.T) { - f := features.New("pod list").Assess( - "pods from kube-system", - func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - var pods corev1.PodList - err := cfg.Client().Resources("kube-system").List(context.TODO(), &pods) - if err != nil { - t.Fatal(err) - } - progressReporterVal := ctx.Value(ProgressReporterCtxKey) - progressReporter:=progressReporterVal.(plugin_helper.ProgressReporter) - for i:=0;i<5;i++{ - time.Sleep(5*time.Second) - progressReporter.SendMessageAsync("Waiting for a long test...") - } - return ctx - }) - - testenv.Test(t, f.Feature()) -} diff --git a/Tests/kaas/kaas-sonobuoy-tests/Dockerfile b/Tests/kaas/kaas-sonobuoy-tests/Dockerfile new file mode 100644 index 000000000..738c3bd55 --- /dev/null +++ b/Tests/kaas/kaas-sonobuoy-tests/Dockerfile @@ -0,0 +1,28 @@ +FROM golang:1.23 + +# Use build arguments to get the correct architecture +ARG TARGETARCH + +# Install kubectl based on the architecture +#See https://github.com/kubernetes-sigs/kubespray/pull/10066 +RUN apt-get update && apt-get install -y wget jq && \ + if [ "$TARGETARCH" = "amd64" ]; then \ + wget https://cdn.dl.k8s.io/release/v1.31.1/bin/linux/amd64/kubectl -O /usr/bin/kubectl; \ + elif [ "$TARGETARCH" = "arm64" ]; then \ + wget https://cdn.dl.k8s.io/release/v1.31.1/bin/linux/arm64/kubectl -O /usr/bin/kubectl; \ + else \ + echo "Unsupported architecture: $TARGETARCH" && exit 1; \ + fi && \ + chmod +x /usr/bin/kubectl + +COPY ./scs_k8s_conformance_tests /src/scs_k8s_conformance_tests +WORKDIR /src +COPY go.* /src/ +ENV CGO_ENABLED=0 +RUN go mod download + +#see: https://docs.docker.com/build/guide/mounts/ +RUN --mount=type=cache,target=/root/.cache/go-build \ + go test -c -o custom.test ./... 
+ +CMD ["bash", "-c", "go tool test2json ./custom.test -test.v"] diff --git a/Tests/kaas/kaas-sonobuoy-tests/Makefile b/Tests/kaas/kaas-sonobuoy-tests/Makefile new file mode 100644 index 000000000..dffc6d7a2 --- /dev/null +++ b/Tests/kaas/kaas-sonobuoy-tests/Makefile @@ -0,0 +1,132 @@ +# Makefile +# This makefile is for development purpose + +############################################################################### +## Setup: ## +############################################################################### +SHELL = /bin/bash + +DOCKERFILE="Dockerfile" +IMAGE_REGISTRY="ghcr.io/sovereigncloudstack/standards" +IMAGE_NAME="scs-kaas-conformance" + +KIND_CLUSTER ="testcluster" + +PLUGIN_NAME="scs-conformance-sonobuoy-plugin" +PLUGIN_FILE="${PLUGIN_NAME}.yaml" +KIND_CONFIG_FILE="kind_config.yaml" + +SONO_WAIT= 10 + +############################################################################### +## Helpers: ## +############################################################################### + +ifeq ($(IMAGE_VERSION_TAG),) + export TAG=dev +else + export TAG=${IMAGE_VERSION_TAG} +endif + +SONOBUOY_IMAGE = "${IMAGE_REGISTRY}/${IMAGE_NAME}:${TAG}" + +container-init: + @echo "" + @echo "[ContainerImageName] ${SONOBUOY_IMAGE}" + @echo "[SonobuoyPluginFile] ${PLUGIN_FILE}" + @echo "" + + +kind-init: + @echo "" + @echo "[KindCluster] ${KIND_CLUSTER}" + @echo "" + +############################################################################### +## For develpoment usage: ## +############################################################################### + +dev-prerequests: + @kind version + @docker version -f json | jq '.Client.Version' + @sonobuoy version --short + @yq --version + @jq --version + @go version + @docker buildx version + + +dev-setup: kind-init + kind create cluster --config ${KIND_CONFIG_FILE} --name ${KIND_CLUSTER} + + +dev-build: container-init + @echo "[Building image...]" + DOCKER_BUILDKIT=1 docker build . -f ${DOCKERFILE} -t ${SONOBUOY_IMAGE} + kind load docker-image --name ${KIND_CLUSTER} ${SONOBUOY_IMAGE} + + +dev-run: + @echo "[Running sonobuoy...]" + @sonobuoy run -p ${PLUGIN_FILE} --wait=${SONO_WAIT} + + +dev-run-background: + @echo "[Running sonobuoy in background...]" + @sonobuoy run -p ${PLUGIN_FILE} + @sonobuoy status + + +dev-result: dev-clean-result + @echo "[Retrieve results...]" + sonobuoy retrieve + @echo "[Extracting results...]" + mkdir results + tar -xf *.tar.gz -C results + cat results/plugins/scs-kaas-conformance/sonobuoy_results.yaml | yq + cat results/plugins/scs-kaas-conformance/results/global/out.json | jq '.Output' + @echo "[Displaying results...]" + sonobuoy results *.tar.gz + + +dev-rerun: dev-clean-sonobuoy dev-build dev-run dev-result + + +test-function: + @echo "only run tests for: $${TESTFUNCTION_CODE}" + DEVELOPMENT_MODE=createcluster go test -run=$${TESTFUNCTION_CODE} ./... || true + +lint: check-golangci-lint + @echo "[Running golangci-lint...]" + @golangci-lint run ./... -v || true + +GOLANGCI_LINT_VERSION ?= v1.61.0 +check-golangci-lint: + @if ! 
[ -x "$$(command -v golangci-lint)" ]; then \ + echo "[golangci-lint not found, installing...]"; \ + go install github.com/golangci/golangci-lint/cmd/golangci-lint@$(GOLANGCI_LINT_VERSION); \ + echo "[golangci-lint installed]"; \ + else \ + echo "[golangci-lint is already installed]"; \ + fi + +dev-clean-result: + @rm -rf *.tar.gz || true + @rm -rf results || true + + +dev-clean-sonobuoy: dev-clean-result + @echo "[Cleanup sonobuoy environment from cluster...]" + @sonobuoy delete --all --wait || true + + +dev-purge: kind-init dev-clean-sonobuoy + @echo "[Purge everthing...]" + @echo "[Deleting kind cluster...]" + kind delete cluster --name ${KIND_CLUSTER} || true + @echo "[Removing docker image...]" + docker rmi ${SONOBUOY_IMAGE} || true + @rm -rf ./build || true + +PHONY: dev-prerequests dev-build dev-run dev-result dev-clean-sonobuoy dev-clean-result dev-purge dev-rerun dev-run-background + diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/README.md b/Tests/kaas/kaas-sonobuoy-tests/README.md similarity index 80% rename from Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/README.md rename to Tests/kaas/kaas-sonobuoy-tests/README.md index cf5681dac..b5db4532d 100644 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/README.md +++ b/Tests/kaas/kaas-sonobuoy-tests/README.md @@ -34,21 +34,13 @@ For test development with Sonobuoy, [KinD](https://kind.sigs.k8s.io/) is used as make dev-setup ``` -1. Set environment variables - - ```bash - export IMAGE_VERSION_TAG="dev" - export K8S_HOST= - export K8S_PORT= - ``` - -2. Build the image and upload it to the KinD cluster +1. Build the image and upload it to the KinD cluster ```bash make dev-build ``` -3. Execute the Sonobuoy plugin +2. Execute the Sonobuoy plugin ```bash make dev-run @@ -61,7 +53,7 @@ For test development with Sonobuoy, [KinD](https://kind.sigs.k8s.io/) is used as sonobuoy status ``` -4. Retrieve the Results +3. Retrieve the Results Once Sonobuoy is done running the plugin you can retrieve the results as following: @@ -69,7 +61,7 @@ For test development with Sonobuoy, [KinD](https://kind.sigs.k8s.io/) is used as make dev-result ``` -5. Clean the Sonobuoy testcase from the KinD cluster +4. Clean the Sonobuoy testcase from the KinD cluster Cleaning up all Kubernetes resources which were placed on the KinD cluster by sonobuoy @@ -77,10 +69,21 @@ For test development with Sonobuoy, [KinD](https://kind.sigs.k8s.io/) is used as make dev-clean ``` -6. 
Purge everything +These steps can also be carried out in short form using the following command: - Deleting the KinD cluster +```bash +make dev-rerun +``` - ```bash - make dev-purge - ``` +Finnaly to remove the kind cluster simply use: + +```bash +make dev-purge +``` + +## Execution of only certain test functions for development purposes + +```bash +export TESTFUNCTION_CODE= +make test-function +``` diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.mod b/Tests/kaas/kaas-sonobuoy-tests/go.mod similarity index 95% rename from Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.mod rename to Tests/kaas/kaas-sonobuoy-tests/go.mod index b1fe960ce..1fe28bda6 100644 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.mod +++ b/Tests/kaas/kaas-sonobuoy-tests/go.mod @@ -1,6 +1,6 @@ -module kaas/kaas-sonobuoy-go-example-2 +module kaas/kaas-sonobuoy-tests -go 1.17 +go 1.21 require ( github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper v0.0.0-20211029183731-1d6848b67eec @@ -27,6 +27,7 @@ require ( github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b // indirect github.com/sirupsen/logrus v1.7.0 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/vladimirvivien/gexe v0.1.1 // indirect github.com/vmware-tanzu/sonobuoy v1.11.5-prerelease.1.0.20211004145628-b633b4fefcdc // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.sum b/Tests/kaas/kaas-sonobuoy-tests/go.sum similarity index 96% rename from Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.sum rename to Tests/kaas/kaas-sonobuoy-tests/go.sum index 7fed0bb8f..5bddac2e0 100644 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/go.sum +++ b/Tests/kaas/kaas-sonobuoy-tests/go.sum @@ -413,7 +413,6 @@ github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= @@ -444,9 +443,6 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= -golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod 
h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -478,8 +474,6 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -511,12 +505,7 @@ golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/ golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210224082022-3d97a244fca7/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -533,8 +522,6 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -583,21 +570,12 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210426230700-d19ff857e887/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
-golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210615171337-6886f2dfbf5b/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -607,9 +585,6 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -659,8 +634,6 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -783,7 +756,6 @@ k8s.io/code-generator v0.21.1/go.mod h1:hUlps5+9QaTrKx+jiM4rmq7YmH8wPOIko64uZCHD k8s.io/component-base v0.21.1/go.mod 
h1:NgzFZ2qu4m1juby4TnrmpR8adRk6ka62YdH5DkIIyKA= k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/gengo v0.0.0-20201214224949-b6c5ce23f027/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= -k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y= diff --git a/Tests/kaas/kaas-sonobuoy-tests/kind_config.yaml b/Tests/kaas/kaas-sonobuoy-tests/kind_config.yaml new file mode 100644 index 000000000..947a9fa8a --- /dev/null +++ b/Tests/kaas/kaas-sonobuoy-tests/kind_config.yaml @@ -0,0 +1,10 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +networking: + apiServerAddress: 127.0.0.1 + apiServerPort: 6443 +nodes: +- role: control-plane +- role: worker +- role: worker +- role: worker diff --git a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/plugin.yaml b/Tests/kaas/kaas-sonobuoy-tests/scs-conformance-sonobuoy-plugin.yaml similarity index 70% rename from Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/plugin.yaml rename to Tests/kaas/kaas-sonobuoy-tests/scs-conformance-sonobuoy-plugin.yaml index c3af316e2..48097d943 100644 --- a/Tests/kaas/kaas-sonobuoy-go-example-e2e-framework/plugin.yaml +++ b/Tests/kaas/kaas-sonobuoy-tests/scs-conformance-sonobuoy-plugin.yaml @@ -1,13 +1,13 @@ sonobuoy-config: driver: Job - plugin-name: scsconformance + plugin-name: scs-kaas-conformance result-format: gojson - description: An easy to start from project for making K8s aware tests. + description: A testsuite for testing the scs conformance of k8s clusters. spec: command: - bash args: ["-c","go tool test2json ./custom.test -test.v | tee ${SONOBUOY_RESULTS_DIR}/out.json ; echo ${SONOBUOY_RESULTS_DIR}/out.json > ${SONOBUOY_RESULTS_DIR}/done"] - image: ghcr.io/sovereigncloudstack/standards/scsconformance:dev + image: ghcr.io/sovereigncloudstack/standards/scs-kaas-conformance:dev env: - name: NS_PREFIX value: custom diff --git a/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/main_test.go b/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/main_test.go new file mode 100644 index 000000000..98d305a5b --- /dev/null +++ b/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/main_test.go @@ -0,0 +1,147 @@ +package scs_k8s_tests + +import ( + "context" + "fmt" + "log" + "os" + "testing" + + plugin_helper "github.com/vmware-tanzu/sonobuoy-plugins/plugin-helper" + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/e2e-framework/pkg/env" + "sigs.k8s.io/e2e-framework/pkg/envconf" + "sigs.k8s.io/e2e-framework/pkg/envfuncs" +) + +// Define a custom type for the context key +type nsContextKey string + +// Define a custom type for context keys +type contextKey string + +const ( + ProgressReporterCtxKey = "SONOBUOY_PROGRESS_REPORTER" + NamespacePrefixKey = "NS_PREFIX" + DevelopmentModeKey = "DEVELOPMENT_MODE" +) + +var testenv env.Environment + +func TestMain(m *testing.M) { + + // Specifying a run ID so that multiple runs wouldn't collide. Allow a prefix to be set via env var + // so that a plugin configuration (yaml file) can easily set that without code changes. + nsPrefix := os.Getenv(NamespacePrefixKey) + runID := envconf.RandomName(nsPrefix, 4) + + // Create updateReporter; will also place into context during Setup for use in features. 
+ updateReporter := plugin_helper.NewProgressReporter(0) + + developmentMode := os.Getenv(DevelopmentModeKey) + log.Printf("Setup test environment for: %#v", developmentMode) + + switch KubernetesEnvironment := developmentMode; KubernetesEnvironment { + + case "createcluster": + log.Println("Create kind cluster for test") + testenv = env.New() + kindClusterName := envconf.RandomName("gotestcluster", 16) + //~ namespace := envconf.RandomName("testnamespace", 16) + + testenv.Setup( + envfuncs.CreateKindCluster(kindClusterName), + ) + + testenv.Finish( + //~ envfuncs.DeleteNamespace(namespace), + envfuncs.DestroyKindCluster(kindClusterName), + ) + + case "usecluster": + log.Println("Use existing k8s cluster for the test") + log.Println("Not Yet Implemented") + //~ testenv = env.NewFromFlags() + //~ KubeConfig:= os.Getenv(KUBECONFIGFILE) + //~ testenv = env.NewWithKubeConfig(KubeConfig) + + default: + // Assume we are running in the cluster as a Sonobuoy plugin. + log.Println("Running tests inside k8s cluster") + testenv = env.NewInClusterConfig() + + testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { + // Try and create the client; doing it before all the tests allows the tests to assume + // it can be created without error and they can just use config.Client(). + _, err := config.NewClient() + return context.WithValue(ctx, contextKey(ProgressReporterCtxKey), updateReporter), err + }) + + testenv.Finish( + func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { + log.Println("Finished go test suite") + //~ if err := ???; err != nil{ + //~ return ctx, err + //~ } + return ctx, nil + }, + ) + + } + + testenv.BeforeEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { + fmt.Println("BeforeEachTest") + updateReporter.StartTest(t.Name()) + return createNSForTest(ctx, cfg, t, runID) + }) + + testenv.AfterEachTest(func(ctx context.Context, cfg *envconf.Config, t *testing.T) (context.Context, error) { + fmt.Println("AfterEachTest") + updateReporter.StopTest(t.Name(), t.Failed(), t.Skipped(), nil) + return deleteNSForTest(ctx, cfg, t, runID) + }) + + /* + testenv.BeforeEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { + // Note that you can also add logic here for before a feature is tested. There may be + // more than one feature in a test. + fmt.Println("BeforeEachFeature") + return ctx, nil + }) + + testenv.AfterEachFeature(func(ctx context.Context, config *envconf.Config, info features.Feature) (context.Context, error) { + // Note that you can also add logic here for after a feature is tested. There may be + // more than one feature in a test. + fmt.Println("AfterEachFeature") + return ctx, nil + }) + */ + + os.Exit(testenv.Run(m)) +} + +// CreateNSForTest creates a random namespace with the runID as a prefix. It is stored in the context +// so that the deleteNSForTest routine can look it up and delete it. +func createNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { + ns := envconf.RandomName(runID, 10) + ctx = context.WithValue(ctx, nsKey(t), ns) + + t.Logf("Creating namespace %v for test %v", ns, t.Name()) + nsObj := v1.Namespace{} + nsObj.Name = ns + return ctx, cfg.Client().Resources().Create(ctx, &nsObj) +} + +// DeleteNSForTest looks up the namespace corresponding to the given test and deletes it. 
+func deleteNSForTest(ctx context.Context, cfg *envconf.Config, t *testing.T, runID string) (context.Context, error) { + ns := fmt.Sprint(ctx.Value(nsKey(t))) + t.Logf("Deleting namespace %v for test %v", ns, t.Name()) + + nsObj := v1.Namespace{} + nsObj.Name = ns + return ctx, cfg.Client().Resources().Delete(ctx, &nsObj) +} + +func nsKey(t *testing.T) nsContextKey { + return nsContextKey("NS-for-" + t.Name()) +} diff --git a/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/scs_0200_smoke_test.go b/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/scs_0200_smoke_test.go new file mode 100644 index 000000000..62ec43e3d --- /dev/null +++ b/Tests/kaas/kaas-sonobuoy-tests/scs_k8s_conformance_tests/scs_0200_smoke_test.go @@ -0,0 +1,15 @@ +package scs_k8s_tests + +import ( + "os" + "testing" +) + +func Test_scs_0200_smoke(t *testing.T) { + // This test ensures that no DevelopmentMode was set + // when using this test-suite productively + developmentMode := os.Getenv(DevelopmentModeKey) + if developmentMode != "" { + t.Errorf("developmentMode is set to = %v; want None", developmentMode) + } +} diff --git a/Tests/requirements.txt b/Tests/requirements.txt index 12f76944e..9505a7061 100644 --- a/Tests/requirements.txt +++ b/Tests/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via -r requirements.in -cryptography==43.0.0 +cryptography==43.0.1 # via openstacksdk decorator==5.1.1 # via diff --git a/Tests/scs-compatible-iaas.yaml b/Tests/scs-compatible-iaas.yaml index 0d9c0ee61..5ad119fbf 100644 --- a/Tests/scs-compatible-iaas.yaml +++ b/Tests/scs-compatible-iaas.yaml @@ -154,7 +154,75 @@ modules: tags: [mandatory] description: > Must fulfill all requirements of + - id: scs-0114-v1 + name: Volume Types + url: https://docs.scs.community/standards/scs-0114-v1-volume-type-standard + run: + - executable: ./iaas/volume-types/volume-types-check.py + args: -c {os_cloud} -d + testcases: + - id: volume-types-check + tags: [mandatory] + description: > + Must fulfill all requirements of + - id: scs-0115-v1 + name: Default rules for security groups + url: https://docs.scs.community/standards/scs-0115-v1-default-rules-for-security-groups + run: + - executable: ./iaas/security-groups/default-security-group-rules.py + args: --os-cloud {os_cloud} --debug + testcases: + - id: security-groups-default-rules-check + tags: [mandatory] + description: > + Must fulfill all requirements of + - id: scs-0116-v1 + name: Key manager + url: https://docs.scs.community/standards/scs-0116-v1-key-manager-standard + run: + - executable: ./iaas/key-manager/check-for-key-manager.py + args: --os-cloud {os_cloud} --debug + testcases: + - id: key-manager-check + tags: [mandatory] + description: > + Must fulfill all requirements of + - id: scs-0117-v1 + name: Volume backup + url: https://docs.scs.community/standards/scs-0117-v1-volume-backup-service + run: + - executable: ./iaas/volume-backup/volume-backup-tester.py + args: --os-cloud {os_cloud} --debug + testcases: + - id: volume-backup-check + tags: [mandatory] + description: > + Must fulfill all requirements of + - id: scs-0121-v1 + name: Availability Zones + url: https://docs.scs.community/standards/scs-0121-v1-Availability-Zones-Standard + testcases: + - id: availability-zones-check + tags: [availability-zones] + description: > + Note: manual check! 
Must fulfill all requirements of + - id: scs-0302-v1 + name: Domain Manager Role + url: https://docs.scs.community/standards/scs-0302-v1-domain-manager-role + # run: + # - executable: ./iam/domain-manager/domain-manager-check.py + # args: --os-cloud {os_cloud} --debug --domain-config ... + testcases: + - id: domain-manager-check + tags: [domain-manager] + description: > + Note: manual check! Must fulfill all requirements of timeline: + - date: 2024-11-08 + versions: + v5: draft + v4: effective + v3: deprecated - date: 2024-08-23 versions: v5: draft @@ -202,8 +270,15 @@ versions: - ref: scs-0104-v1 parameters: image_spec: https://raw.githubusercontent.com/SovereignCloudStack/standards/main/Tests/iaas/scs-0104-v1-images-v5.yaml + - scs-0114-v1 + - scs-0115-v1 + - scs-0116-v1 + - scs-0117-v1 + - scs-0121-v1 + - scs-0302-v1 targets: main: mandatory + preview: domain-manager/availability-zones - version: v4 stabilized_at: 2024-02-28 include: diff --git a/Tests/scs-compliance-check.py b/Tests/scs-compliance-check.py index 64da66133..a81c6737e 100755 --- a/Tests/scs-compliance-check.py +++ b/Tests/scs-compliance-check.py @@ -108,9 +108,11 @@ def apply_argv(self, argv): usage() sys.exit(0) elif opt[0] == "-v" or opt[0] == "--verbose": + if self.verbose: + logger.setLevel(logging.DEBUG) self.verbose = True elif opt[0] == "--debug": - logging.getLogger().setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) elif opt[0] == "-q" or opt[0] == "--quiet": self.quiet = True logging.getLogger().setLevel(logging.ERROR) @@ -271,7 +273,7 @@ def run_suite(suite: TestSuite, runner: CheckRunner): return builder.finalize(permissible_ids=suite.ids) -def print_report(subject: str, suite: TestSuite, targets: dict, results: dict): +def print_report(subject: str, suite: TestSuite, targets: dict, results: dict, verbose=False): print(f"{subject} {suite.name}:") for tname, target_spec in targets.items(): failed, missing, passed = suite.select(tname, target_spec).eval_buckets(results) @@ -283,7 +285,10 @@ def print_report(subject: str, suite: TestSuite, targets: dict, results: dict): summary_parts.append(f"{len(missing)} missing") verdict += f" ({', '.join(summary_parts)})" print(f"- {tname}: {verdict}") - for offenders, category in ((failed, 'FAILED'), (missing, 'MISSING')): + reportcateg = [(failed, 'FAILED'), (missing, 'MISSING')] + if verbose: + reportcateg.append((passed, 'PASSED')) + for offenders, category in reportcateg: if category == 'MISSING' and suite.partial: continue # do not report each missing testcase if a filter was used if not offenders: @@ -363,7 +368,7 @@ def main(argv): if runner.spamminess: print("********" * 10) # 80 characters for version, suite, results in report_data: - print_report(config.subject, suite, version['targets'], results) + print_report(config.subject, suite, version['targets'], results, config.verbose) if config.output: version_report = {version['version']: results for version, _, results in report_data} report = create_report(argv, config, spec, version_report, runner.get_invocations()) diff --git a/compliance-monitor/README.md b/compliance-monitor/README.md index e4f21d385..b504b6eba 100644 --- a/compliance-monitor/README.md +++ b/compliance-monitor/README.md @@ -209,16 +209,21 @@ Needs to be authenticated (via basic auth). Supports content type `text/plain; version=0.0.4; charset=utf-8` only. -### GET /pages +### GET /{view_type}/table\[_full\] -Returns the compliance table for all active subjects (type `text/html`). 
+Returns the compliance table for all active subjects, where `view_type` can be one of the following: -Query parameters: +- `markdown`: return Markdown fragment (mimetype `text/markdown`) +- `fragment`: return HTML fragment (mimetype `text/html`) +- `page`: return full HTML page (mimetype `text/html`) + +If `table_full` is used, then HTTP basic auth must be performed, and the table will show the +privileged view (i.e., any FAIL will be reported regardless of manual approval). -- `fragment_only` (optional `0` or `1`, default `1`): return just the table (otherwise a complete HTML doc) +### GET /{view_type}/details\[_full\]/{subject}/{scopeuuid} -### GET /subjects +Returns compliance details for given subject and scope. -Returns the list of subjects (together with activity status). +### GET /{view_type}/scope/{scopeuuid} -### POST /subjects +Returns spec overview for the given scope. diff --git a/compliance-monitor/bootstrap.yaml b/compliance-monitor/bootstrap.yaml index 8928923f9..8339c422d 100644 --- a/compliance-monitor/bootstrap.yaml +++ b/compliance-monitor/bootstrap.yaml @@ -50,6 +50,9 @@ accounts: - public_key: "AAAAC3NzaC1lZDI1NTE5AAAAILufk4C7e0eQQIkmUDK8GB2IoiDjYtv6mx2eE8wZ3VWT" public_key_type: "ssh-ed25519" public_key_name: "primary" + - subject: scaleup-occ2 + delegates: + - zuul_ci - subject: syseleven-dus2 delegates: - zuul_ci @@ -63,52 +66,3 @@ accounts: - public_key: "AAAAC3NzaC1lZDI1NTE5AAAAILufk4C7e0eQQIkmUDK8GB2IoiDjYtv6mx2eE8wZ3VWT" public_key_type: "ssh-ed25519" public_key_name: "primary" -subjects: - gx-scs: - active: true - name: gx-scs - provider: plusserver GmbH - artcodix: - active: true - name: CNDS - provider: artcodix GmbH - pco-prod1: - active: true - name: pluscloud open prod1 - provider: plusserver GmbH - pco-prod2: - active: true - name: pluscloud open prod2 - provider: plusserver GmbH - pco-prod3: - active: true - name: pluscloud open prod3 - provider: plusserver GmbH - pco-prod4: - active: true - name: pluscloud open prod4 - provider: plusserver GmbH - poc-kdo: - active: true - name: PoC KDO - provider: KDO Service GmbH / OSISM GmbH - poc-wgcloud: - active: true - name: PoC WG-Cloud OSBA - provider: Cloud&Heat Technologies GmbH - syseleven-dus2: - active: true - name: SysEleven dus2 - provider: SysEleven GmbH - syseleven-ham1: - active: true - name: SysEleven ham1 - provider: SysEleven GmbH - regio-a: - active: true - name: REGIO.cloud - provider: OSISM GmbH - wavestack: - active: true - name: Wavestack - provider: noris network AG/Wavecon GmbH diff --git a/compliance-monitor/docker-compose.yml b/compliance-monitor/docker-compose.yml index a526d07e8..118290843 100644 --- a/compliance-monitor/docker-compose.yml +++ b/compliance-monitor/docker-compose.yml @@ -27,6 +27,7 @@ services: environment: - SCM_DB_HOST=postgres - SCM_DB_PASSWORD_FILE=/run/secrets/db_password + - SCM_BASE_URL=https://compliance.sovereignit.cloud/ volumes: - ../Tests:/Tests - ./bootstrap.yaml:/code/bootstrap.yaml diff --git a/compliance-monitor/monitor.py b/compliance-monitor/monitor.py index e18acc97f..c6dcb2a41 100755 --- a/compliance-monitor/monitor.py +++ b/compliance-monitor/monitor.py @@ -1,4 +1,17 @@ #!/usr/bin/env python3 +# AN IMPORTANT NOTE ON CONCURRENCY: +# This server is based on uvicorn and, as such, is not multi-threaded. +# (It could use multiple processes, but we don't do that yet.) +# Consequently, we don't need to use any measures for thread-safety. 
+# However, if we do at some point enable the use of multiple processes, +# we should make sure that all processes are "on the same page" with regard +# to basic data such as certificate scopes, templates, and accounts. +# One way to achieve this synchronicity could be to use the Postgres server +# more, however, I hope that more efficient ways are possible. +# Also, it is quite likely that the signal SIGHUP could no longer be used +# to trigger a re-load. In any case, the `uvicorn.run` call would have to be +# fundamentally changed: +# > You must pass the application as an import string to enable 'reload' or 'workers'. from collections import defaultdict from datetime import date, datetime, timedelta from enum import Enum @@ -7,11 +20,9 @@ import os import os.path from shutil import which +import signal from subprocess import run from tempfile import NamedTemporaryFile -# _thread: low-level library, but (contrary to the name) not private -# https://docs.python.org/3/library/_thread.html -from _thread import allocate_lock, get_ident from typing import Annotated, Optional from fastapi import Depends, FastAPI, HTTPException, Request, Response, status @@ -30,8 +41,7 @@ db_find_account, db_update_account, db_update_publickey, db_filter_publickeys, db_get_reports, db_get_keys, db_insert_report, db_get_recent_results2, db_patch_approval2, db_get_report, db_ensure_schema, db_get_apikeys, db_update_apikey, db_filter_apikeys, db_clear_delegates, - db_patch_subject, db_get_subjects, db_insert_result2, db_get_relevant_results2, db_add_delegate, - db_find_subjects, + db_find_subjects, db_insert_result2, db_get_relevant_results2, db_add_delegate, ) @@ -86,6 +96,11 @@ class ViewType(Enum): fragment = "fragment" +VIEW_REPORT = { + ViewType.markdown: 'report.md', + ViewType.fragment: 'report.md', + ViewType.page: 'overview.html', +} VIEW_DETAIL = { ViewType.markdown: 'details.md', ViewType.fragment: 'details.md', @@ -96,7 +111,12 @@ class ViewType(Enum): ViewType.fragment: 'overview.md', ViewType.page: 'overview.html', } -REQUIRED_TEMPLATES = tuple(set(fn for view in (VIEW_DETAIL, VIEW_TABLE) for fn in view.values())) +VIEW_SCOPE = { + ViewType.markdown: 'scope.md', + ViewType.fragment: 'scope.md', + ViewType.page: 'overview.html', +} +REQUIRED_TEMPLATES = tuple(set(fn for view in (VIEW_REPORT, VIEW_DETAIL, VIEW_TABLE, VIEW_SCOPE) for fn in view.values())) # do I hate these globals, but I don't see another way with these frameworks @@ -112,10 +132,7 @@ class ViewType(Enum): templates_map = { k: None for k in REQUIRED_TEMPLATES } -# map thread id (cf. 
`get_ident`) to a dict that maps scope uuids to scope documents -# -- access this using function `get_scopes` -_scopes = defaultdict(dict) # thread-local storage (similar to threading.local, but more efficient) -_scopes_lock = allocate_lock() # mutex lock so threads can add their local storage without races +_scopes = {} # map scope uuid to `PrecomputedScope` instance class TimestampEncoder(json.JSONEncoder): @@ -211,8 +228,6 @@ def import_bootstrap(bootstrap_path, conn): db_filter_apikeys(cur, accountid, lambda keyid, *_: keyid in keyids) keyids = set(db_update_publickey(cur, accountid, key) for key in account.get("keys", ())) db_filter_publickeys(cur, accountid, lambda keyid, *_: keyid in keyids) - for subject, record in subjects.items(): - db_patch_subject(cur, {'subject': subject, **record}) conn.commit() @@ -266,10 +281,12 @@ def evaluate(self, scope_results): by_validity[self.versions[vname].validity].append(vname) # go through worsening validity values until a passing version is found relevant = [] + best_passed = None for validity in ('effective', 'warn', 'deprecated'): vnames = by_validity[validity] relevant.extend(vnames) if any(version_results[vname]['result'] == 1 for vname in vnames): + best_passed = validity break # always include draft (but only at the end) relevant.extend(by_validity['draft']) @@ -283,6 +300,7 @@ def evaluate(self, scope_results): vname + ASTERISK_LOOKUP[self.versions[vname].validity] for vname in passed ]), + 'best_passed': best_passed, } def update_lookup(self, target_dict): @@ -321,19 +339,8 @@ def import_cert_yaml_dir(yaml_path, target_dict): def get_scopes(): - """returns thread-local copy of the scopes dict""" - ident = get_ident() - with _scopes_lock: - yaml_path = _scopes['_yaml_path'] - counter = _scopes['_counter'] - current = _scopes.get(ident) - if current is None: - _scopes[ident] = current = {'_counter': -1} - if current['_counter'] != counter: - current.clear() - import_cert_yaml_dir(yaml_path, current) - current['_counter'] = counter - return current + """returns the scopes dict""" + return _scopes def import_templates(template_dir, env, templates): @@ -540,14 +547,24 @@ async def get_status( return convert_result_rows_to_dict2(rows2, get_scopes(), include_report=True) -def render_view(view, view_type, results, base_url='/', title=None): +def _build_report_url(base_url, report, *args, **kwargs): + if kwargs.get('download'): + return f"{base_url}reports/{report}" + url = f"{base_url}page/report/{report}" + if len(args) == 2: # version, testcase_id --> add corresponding fragment specifier + url += f"#{args[0]}_{args[1]}" + return url + + +def render_view(view, view_type, detail_page='detail', base_url='/', title=None, **kwargs): media_type = {ViewType.markdown: 'text/markdown'}.get(view_type, 'text/html') stage1 = stage2 = view[view_type] if view_type is ViewType.page: stage1 = view[ViewType.fragment] - def detail_url(subject, scope): return f"{base_url}page/detail/{subject}/{scope}" # noqa: E306,E704 - def report_url(report): return f"{base_url}reports/{report}" # noqa: E306,E704 - fragment = templates_map[stage1].render(results=results, detail_url=detail_url, report_url=report_url) + def scope_url(uuid): return f"{base_url}page/scope/{uuid}" # noqa: E306,E704 + def detail_url(subject, scope): return f"{base_url}page/{detail_page}/{subject}/{scope}" # noqa: E306,E704 + def report_url(report, *args, **kwargs): return _build_report_url(base_url, report, *args, **kwargs) # noqa: E306,E704 + fragment = 
templates_map[stage1].render(detail_url=detail_url, report_url=report_url, scope_url=scope_url, **kwargs) if view_type != ViewType.markdown and stage1.endswith('.md'): fragment = markdown(fragment, extensions=['extra']) if stage1 != stage2: @@ -555,6 +572,23 @@ def report_url(report): return f"{base_url}reports/{report}" # noqa: E306,E704 return Response(content=fragment, media_type=media_type) +@app.get("/{view_type}/report/{report_uuid}") +async def get_report_view( + request: Request, + account: Annotated[Optional[tuple[str, str]], Depends(auth)], + conn: Annotated[connection, Depends(get_conn)], + view_type: ViewType, + report_uuid: str, +): + with conn.cursor() as cur: + specs = db_get_report(cur, report_uuid) + if not specs: + raise HTTPException(status_code=404) + spec = specs[0] + check_role(account, spec['subject'], ROLES['read_any']) + return render_view(VIEW_REPORT, view_type, report=spec, base_url=settings.base_url, title=f'Report {report_uuid}') + + @app.get("/{view_type}/detail/{subject}/{scopeuuid}") async def get_detail( request: Request, @@ -569,7 +603,7 @@ async def get_detail( rows2, get_scopes(), grace_period_days=GRACE_PERIOD_DAYS, subjects=(subject, ), scopes=(scopeuuid, ), ) - return render_view(VIEW_DETAIL, view_type, results2, base_url=settings.base_url, title=f'{subject} compliance') + return render_view(VIEW_DETAIL, view_type, results=results2, base_url=settings.base_url, title=f'{subject} compliance') @app.get("/{view_type}/detail_full/{subject}/{scopeuuid}") @@ -587,7 +621,7 @@ async def get_detail_full( results2 = convert_result_rows_to_dict2( rows2, get_scopes(), include_report=True, subjects=(subject, ), scopes=(scopeuuid, ), ) - return render_view(VIEW_DETAIL, view_type, results2, base_url=settings.base_url, title=f'{subject} compliance') + return render_view(VIEW_DETAIL, view_type, results=results2, base_url=settings.base_url, title=f'{subject} compliance') @app.get("/{view_type}/table") @@ -599,7 +633,7 @@ async def get_table( with conn.cursor() as cur: rows2 = db_get_relevant_results2(cur, approved_only=True) results2 = convert_result_rows_to_dict2(rows2, get_scopes(), grace_period_days=GRACE_PERIOD_DAYS) - return render_view(VIEW_TABLE, view_type, results2, base_url=settings.base_url, title="SCS compliance overview") + return render_view(VIEW_TABLE, view_type, results=results2, base_url=settings.base_url, title="SCS compliance overview") @app.get("/{view_type}/table_full") @@ -613,7 +647,33 @@ async def get_table_full( with conn.cursor() as cur: rows2 = db_get_relevant_results2(cur, approved_only=False) results2 = convert_result_rows_to_dict2(rows2, get_scopes()) - return render_view(VIEW_TABLE, view_type, results2, base_url=settings.base_url, title="SCS compliance overview") + return render_view( + VIEW_TABLE, view_type, results=results2, + detail_page='detail_full', base_url=settings.base_url, + title="SCS compliance overview", + ) + + +@app.get("/{view_type}/scope/{scopeuuid}") +async def get_scope( + request: Request, + conn: Annotated[connection, Depends(get_conn)], + view_type: ViewType, + scopeuuid: str, +): + spec = get_scopes()[scopeuuid].spec + versions = spec['versions'] + relevant = sorted([name for name, version in versions.items() if version['_explicit_validity']]) + modules_chart = {} + for name in relevant: + for include in versions[name]['include']: + module_id = include['module']['id'] + row = modules_chart.get(module_id) + if row is None: + row = modules_chart[module_id] = {'module': include['module'], 'columns': {}} + 
row['columns'][name] = include + rows = sorted(list(modules_chart.values()), key=lambda row: row['module']['id']) + return render_view(VIEW_SCOPE, view_type, spec=spec, relevant=relevant, rows=rows, base_url=settings.base_url, title=spec['name']) @app.get("/pages") @@ -657,48 +717,22 @@ async def post_results( conn.commit() -@app.get("/subjects") -async def get_subjects( - request: Request, - account: Annotated[tuple[str, str], Depends(auth)], - conn: Annotated[connection, Depends(get_conn)], - active: Optional[bool] = None, limit: int = 10, skip: int = 0, -): - """get subjects, potentially filtered by activity status""" - check_role(account, roles=ROLES['read_any']) - with conn.cursor() as cur: - return db_get_subjects(cur, active, limit, skip) - - -@app.post("/subjects") -async def post_subjects( - request: Request, - account: Annotated[tuple[str, str], Depends(auth)], - conn: Annotated[connection, Depends(get_conn)], -): - """post approvals to this endpoint""" - check_role(account, roles=ROLES['admin']) - content_type = request.headers['content-type'] - if content_type not in ('application/json', ): - raise HTTPException(status_code=500, detail="Unsupported content type") - body = await request.body() - document = json.loads(body.decode("utf-8")) - records = [document] if isinstance(document, dict) else document - with conn.cursor() as cur: - for record in records: - db_patch_subject(cur, record) - conn.commit() +def pick_filter(results, subject, scope): + """Jinja filter to pick scope results from `results` for given `subject` and `scope`""" + return results.get(subject, {}).get(scope, {}) -def passed_filter(results, subject, scope): - """Jinja filter to pick list of passed versions from `results` for given `subject` and `scope`""" - subject_data = results.get(subject) - if not subject_data: - return "" - scope_data = subject_data.get(scope) - if not scope_data: - return "" - return scope_data['passed_str'] +def summary_filter(scope_results): + """Jinja filter to construct summary from `scope_results`""" + passed_str = scope_results.get('passed_str', '') or '–' + best_passed = scope_results.get('best_passed') + # avoid simple 🟢🔴 (hard to distinguish for color-blind folks) + color = { + 'effective': '✅', + 'warn': '✅', # forgo differentiation here in favor of simplicity (will be apparent in version list) + 'deprecated': '🟧', + }.get(best_passed, '🛑') + return f'{color} {passed_str}' def verdict_filter(value): @@ -713,22 +747,31 @@ def verdict_check_filter(value): return {1: '✔', -1: '✘'}.get(value, '⚠') +def reload_static_config(*args, do_ensure_schema=False): + # allow arbitrary arguments so it can readily be used as signal handler + logger.info("loading static config") + scopes = {} + import_cert_yaml_dir(settings.yaml_path, scopes) + # import successful: only NOW destructively update global _scopes + _scopes.clear() + _scopes.update(scopes) + import_templates(settings.template_path, env=env, templates=templates_map) + validate_templates(templates=templates_map) + with mk_conn(settings=settings) as conn: + if do_ensure_schema: + db_ensure_schema(conn) + import_bootstrap(settings.bootstrap_path, conn=conn) + + if __name__ == "__main__": logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) env.filters.update( - passed=passed_filter, + pick=pick_filter, + summary=summary_filter, verdict=verdict_filter, verdict_check=verdict_check_filter, markdown=markdown, ) - with mk_conn(settings=settings) as conn: - db_ensure_schema(conn) - 
import_bootstrap(settings.bootstrap_path, conn=conn) - _scopes.update({ - '_yaml_path': settings.yaml_path, - '_counter': 0, - }) - _ = get_scopes() # make sure they can be read - import_templates(settings.template_path, env=env, templates=templates_map) - validate_templates(templates=templates_map) + reload_static_config(do_ensure_schema=True) + signal.signal(signal.SIGHUP, reload_static_config) uvicorn.run(app, host='0.0.0.0', port=8080, log_level="info", workers=1) diff --git a/compliance-monitor/sql.py b/compliance-monitor/sql.py index e0a1160a9..b4c2549e0 100644 --- a/compliance-monitor/sql.py +++ b/compliance-monitor/sql.py @@ -7,7 +7,6 @@ # use ... (Ellipsis) here to indicate that no default value exists (will lead to error if no value is given) ACCOUNT_DEFAULTS = {'subject': ..., 'api_key': ..., 'roles': ...} PUBLIC_KEY_DEFAULTS = {'public_key': ..., 'public_key_type': ..., 'public_key_name': ...} -SUBJECT_DEFAULTS = {'subject': ..., 'name': ..., 'provider': None, 'active': False} class SchemaVersionError(Exception): @@ -78,12 +77,6 @@ def db_ensure_schema_common(cur: cursor): accountid integer NOT NULL REFERENCES account ON DELETE CASCADE ON UPDATE CASCADE, UNIQUE (accountid, keyname) ); - CREATE TABLE IF NOT EXISTS subject ( - subject text PRIMARY KEY, - active boolean, - name text, - provider text - ); CREATE TABLE IF NOT EXISTS report ( reportid SERIAL PRIMARY KEY, reportuuid text UNIQUE, @@ -409,29 +402,3 @@ def db_patch_approval2(cur: cursor, record): RETURNING resultid;''', record) resultid, = cur.fetchone() return resultid - - -def db_get_subjects(cur: cursor, active: bool, limit, skip): - """list subjects""" - columns = ('subject', 'active', 'name', 'provider') - cur.execute(sql.SQL(''' - SELECT subject, active, name, provider - FROM subject - {where_clause} - LIMIT %(limit)s OFFSET %(skip)s;''').format( - where_clause=make_where_clause( - None if active is None else sql.SQL('active = %(active)s'), - ), - ), {"limit": limit, "skip": skip, "active": active}) - return [{col: val for col, val in zip(columns, row)} for row in cur.fetchall()] - - -def db_patch_subject(cur: cursor, record: dict): - sanitized = sanitize_record(record, SUBJECT_DEFAULTS) - cur.execute(''' - INSERT INTO subject (subject, active, name, provider) - VALUES (%(subject)s, %(active)s, %(name)s, %(provider)s) - ON CONFLICT (subject) - DO UPDATE - SET active = EXCLUDED.active, name = EXCLUDED.name, provider = EXCLUDED.provider - ;''', sanitized) diff --git a/compliance-monitor/templates/details.md.j2 b/compliance-monitor/templates/details.md.j2 index d84ebc012..30136b149 100644 --- a/compliance-monitor/templates/details.md.j2 +++ b/compliance-monitor/templates/details.md.j2 @@ -1,17 +1,21 @@ {% for subject, subject_result in results.items() -%} -# {{ subject }} +{# omit h1 title here because we can only have one of those, + and the html wrapper template will add one anyway -#} {% for scopeuuid, scope_result in subject_result.items() -%} -{% if not scope_result.relevant -%} ## {{ scope_result.name }} +- [spec overview]({{ scope_url(scopeuuid) }}) + +{% if not scope_result.relevant -%} + No recent test results available. 
{% endif -%} {% for version in scope_result.relevant -%} {%- set version_result = scope_result.versions[version] -%} -## {{ scope_result.name }} {{ version }} ({{ version_result.validity }}): {{ version_result.result | verdict }} +### {{ version }} ({{ version_result.validity }}): {{ version_result.result | verdict }} {% for target, target_result in version_result.targets.items() -%} -### Target {{ target }}: {{ target_result.result | verdict }} +#### Target {{ target }}: {{ target_result.result | verdict }} | testcase id | result | description | |---|---|---| @@ -20,7 +24,7 @@ No recent test results available. {% set res = version_result.results[testcase_id] if testcase_id in version_result.results else dict(result=0) -%} | {% if res.result != 1 %}⚠️ {% endif %}{{ testcase.id }} | {#- #} {% if res.report -%} -[{{ res.result | verdict_check }}]({{ report_url(res.report) }}) +[{{ res.result | verdict_check }}]({{ report_url(res.report, version, testcase_id) }}) {%- else -%} {{ res.result | verdict_check }} {%- endif -%} diff --git a/compliance-monitor/templates/overview.html.j2 b/compliance-monitor/templates/overview.html.j2 index 7562ce368..830b94121 100644 --- a/compliance-monitor/templates/overview.html.j2 +++ b/compliance-monitor/templates/overview.html.j2 @@ -1,14 +1,18 @@ + + +{{ title or 'SCS compliance overview' }} + + - - -{{ title or 'SCS compliance overview' }} - -{{fragment}} +{% if title %}

+<h1>{{title}}</h1>
+{% endif %}{{fragment}} diff --git a/compliance-monitor/templates/overview.md.j2 b/compliance-monitor/templates/overview.md.j2 index 8da599af1..77ba6bcc9 100644 --- a/compliance-monitor/templates/overview.md.j2 +++ b/compliance-monitor/templates/overview.md.j2 @@ -2,41 +2,47 @@ we could of course iterate over results etc., but hardcode the table (except the actual results, of course) for the time being to have the highest degree of control -#} + +Version numbers are suffixed by a symbol depending on state: * for _draft_, † for _warn_ (soon to be deprecated), and †† for _deprecated_. + {% set iaas = '50393e6f-2ae1-4c5c-a62c-3b75f2abef3f' -%} -| Name | Description | Operator | SCS-compatible IaaS | HealthMon | +| Name | Description | Operator | [SCS-compatible IaaS](https://docs.scs.community/standards/scs-compatible-iaas/) | HealthMon | |-------|--------------|-----------|----------------------|:----------:| | [gx-scs](https://github.com/SovereignCloudStack/docs/blob/main/community/cloud-resources/plusserver-gx-scs.md) | Dev environment provided for SCS & GAIA-X context | plusserver GmbH | -{#- #} [{{ results | passed('gx-scs', iaas) or '–' }}]({{ detail_url('gx-scs', iaas) }}) {# -#} +{#- #} [{{ results | pick('gx-scs', iaas) | summary }}]({{ detail_url('gx-scs', iaas) }}) {# -#} | [HM](https://health.gx-scs.sovereignit.cloud:3000/) | | [aov.cloud](https://www.aov.de/) | Community cloud for customers | aov IT.Services GmbH | -{#- #} [{{ results | passed('aov', iaas) or '–' }}]({{ detail_url('aov', iaas) }}) {# -#} +{#- #} [{{ results | pick('aov', iaas) | summary }}]({{ detail_url('aov', iaas) }}) {# -#} | [HM](https://health.aov.cloud/) | | [CNDS](https://cnds.io/) | Public cloud for customers | artcodix GmbH | -{#- #} [{{ results | passed('artcodix', iaas) or '–' }}]({{ detail_url('artcodix', iaas) }}) {# -#} +{#- #} [{{ results | pick('artcodix', iaas) | summary }}]({{ detail_url('artcodix', iaas) }}) {# -#} | [HM](https://ohm.muc.cloud.cnds.io/) | | [pluscloud open](https://www.plusserver.com/en/products/pluscloud-open)
(4 regions) | Public cloud for customers | plusserver GmbH | {# #} -{#- #}prod1: [{{ results | passed('pco-prod1', iaas) or '–' }}]({{ detail_url('pco-prod1', iaas) }}){# -#} +{#- #}prod1: [{{ results | pick('pco-prod1', iaas) | summary }}]({{ detail_url('pco-prod1', iaas) }}){# -#}<br/>
-{#- #}prod2: [{{ results | passed('pco-prod2', iaas) or '–' }}]({{ detail_url('pco-prod2', iaas) }}){# -#} +{#- #}prod2: [{{ results | pick('pco-prod2', iaas) | summary }}]({{ detail_url('pco-prod2', iaas) }}){# -#}<br/>
-{#- #}prod3: [{{ results | passed('pco-prod3', iaas) or '–' }}]({{ detail_url('pco-prod3', iaas) }}){# -#} +{#- #}prod3: [{{ results | pick('pco-prod3', iaas) | summary }}]({{ detail_url('pco-prod3', iaas) }}){# -#}<br/>
-{#- #}prod4: [{{ results | passed('pco-prod4', iaas) or '–' }}]({{ detail_url('pco-prod4', iaas) }}) {# -#} +{#- #}prod4: [{{ results | pick('pco-prod4', iaas) | summary }}]({{ detail_url('pco-prod4', iaas) }}) {# -#} | [HM1](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-pco)<br/>
[HM2](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod2)<br/>
[HM3](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod3)<br/>
[HM4](https://health.prod1.plusserver.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?orgId=1&var-mycloud=plus-prod4) | | PoC KDO | Cloud PoC for FITKO | KDO Service GmbH / OSISM GmbH | -{#- #} [{{ results | passed('poc-kdo', iaas) or '–' }}]({{ detail_url('poc-kdo', iaas) }}) {# -#} +{#- #} [{{ results | pick('poc-kdo', iaas) | summary }}]({{ detail_url('poc-kdo', iaas) }}) {# -#} | (soon) | | PoC WG-Cloud OSBA | Cloud PoC for FITKO | Cloud&Heat Technologies GmbH | -{#- #} [{{ results | passed('poc-wgcloud', iaas) or '–' }}]({{ detail_url('poc-wgcloud', iaas) }}) {# -#} +{#- #} [{{ results | pick('poc-wgcloud', iaas) | summary }}]({{ detail_url('poc-wgcloud', iaas) }}) {# -#} | [HM](https://health.poc-wgcloud.osba.sovereignit.cloud:3000/d/9ltTEmlnk/openstack-health-monitor2?var-mycloud=poc-wgcloud&orgId=1) | | [REGIO.cloud](https://regio.digital) | Public cloud for customers | OSISM GmbH | -{#- #} [{{ results | passed('regio-a', iaas) or '–' }}]({{ detail_url('regio-a', iaas) }}) {# -#} +{#- #} [{{ results | pick('regio-a', iaas) | summary }}]({{ detail_url('regio-a', iaas) }}) {# -#} | [HM](https://apimon.services.regio.digital/public-dashboards/17cf094a47404398a5b8e35a4a3968d4?orgId=1&refresh=5m) | +| [ScaleUp Open Cloud](https://www.scaleuptech.com/cloud-hosting/) | Public cloud for customers | ScaleUp Technologies GmbH & Co. KG | +{#- #} [{{ results | pick('scaleup-occ2', iaas) | summary }}]({{ detail_url('scaleup-occ2', iaas) }}) {# -#} +| [HM](https://health.occ2.scaleup.sovereignit.cloud) | | [syseleven](https://www.syseleven.de/en/products-services/openstack-cloud/)
(2 SCS regions) | Public OpenStack Cloud | SysEleven GmbH | {# #} -{#- #}dus2: [{{ results | passed('syseleven-dus2', iaas) or '–' }}]({{ detail_url('syseleven-dus2', iaas) }}){# -#} +{#- #}dus2: [{{ results | pick('syseleven-dus2', iaas) | summary }}]({{ detail_url('syseleven-dus2', iaas) }}){# -#}<br/>
-{#- #}ham1: [{{ results | passed('syseleven-ham1', iaas) or '–' }}]({{ detail_url('syseleven-ham1', iaas) }}) {# -#} +{#- #}ham1: [{{ results | pick('syseleven-ham1', iaas) | summary }}]({{ detail_url('syseleven-ham1', iaas) }}) {# -#} | (soon)<br/>
(soon) | | [Wavestack](https://www.noris.de/wavestack-cloud/) | Public cloud for customers | noris network AG/Wavecon GmbH | -{#- #} [{{ results | passed('wavestack', iaas) or '–' }}]({{ detail_url('wavestack', iaas) }}) {# -#} +{#- #} [{{ results | pick('wavestack', iaas) | summary }}]({{ detail_url('wavestack', iaas) }}) {# -#} | [HM](https://health.wavestack1.sovereignit.cloud:3000/) | diff --git a/compliance-monitor/templates/report.md.j2 b/compliance-monitor/templates/report.md.j2 new file mode 100644 index 000000000..e46c2e086 --- /dev/null +++ b/compliance-monitor/templates/report.md.j2 @@ -0,0 +1,66 @@ +## General info + +- uuid: [{{ report.run.uuid }}]({{ report_url(report.run.uuid, download=True) }}) +- subject: {{ report.subject }} +- scope: [{{ report.spec.name }}]({{ scope_url(report.spec.uuid) }}) +- checked at: {{ report.checked_at }} + +## Results + +{% for version, version_results in report.versions.items() %}{% if version_results %} +### {{ version }} + +| test case | result | invocation | +|---|---|---| +{% for testcase_id, result_data in version_results.items() -%} +| {{ testcase_id }} {: #{{ version + '_' + testcase_id }} } | {{ result_data.result | verdict_check }} | [{{ result_data.invocation }}](#{{ result_data.invocation }}) | +{% endfor %} +{% endif %}{% endfor %} + +## Run + +### Variable assignment + +| key | value | +|---|---| +{% for key, value in report.run.assignment.items() -%} +| `{{ key }}` | `{{ value }}` | +{% endfor %} + +### Check tool invocations + +{% for invid, invdata in report.run.invocations.items() %} +#### Invocation {{invid}} {: #{{ invid }} } + +- cmd: `{{ invdata.cmd }}` +- rc: {{ invdata.rc }} +- channel summary +{%- for channel in ('critical', 'error', 'warning') %} +{%- if invdata[channel] %} + - **{{ channel }}: {{ invdata[channel] }}** +{%- else %} + - {{ channel }}: – +{%- endif %} +{%- endfor %} +- results +{%- for resultid, result in invdata.results.items() %} + - {{ resultid }}: {{ result | verdict_check }} +{%- endfor %} + +{% if invdata.stdout -%} +
<details><summary>Captured stdout</summary> +```text +{{ '\n'.join(invdata.stdout) }} +``` +</details>
+{%- endif %} + +{% if invdata.stderr -%} +<details><summary>
Captured stderr</summary> +{%- for line in invdata.stderr %} +<br/>
{% if line.split(':', 1)[0].lower() in ('warning', 'error', 'critical') %}{{ '<strong>' + line + '</strong>' }}{% else %}{{ line }}{% endif %}
+{%- endfor %} +</details>
+{%- endif %} + +{% endfor %} diff --git a/compliance-monitor/templates/scope.md.j2 b/compliance-monitor/templates/scope.md.j2 new file mode 100644 index 000000000..7c46abce6 --- /dev/null +++ b/compliance-monitor/templates/scope.md.j2 @@ -0,0 +1,26 @@ +| Scope versions -> | {% for name in relevant %} {{name}} |{% endfor %} +| :----------------- | {% for name in relevant %} :-- |{% endfor %} +| State | {% for name in relevant %} {{spec.versions[name].validity | capitalize}} |{% endfor %} +| Stabilized at | {% for name in relevant %} {{spec.versions[name].stabilized_at}} |{% endfor %} +| **Modules** | {% for name in relevant %} |{% endfor %} +{% for row in rows -%} +| [{% if row.module.id.startswith('scs-') %}{{row.module.id}}: {% endif %}{{row.module.name}}]({{row.module.url}}) |{% +for name in relevant + %} {% set column = row.columns[name] %}{% + if column + %}X{% + if column.parameters + %} ({% + for key, value in column.parameters.items() + %}{% + if value.startswith("https://") + %}[{{key}}]({{value}}){% + else + %}{{key}}={{value}}{% + endif %}{{ ", " if not loop.last }}){% + endfor %}{% + endif %}{% + endif %} |{% +endfor +%} +{% endfor %} diff --git a/playbooks/clouds.yaml.j2 b/playbooks/clouds.yaml.j2 index da0d3602d..2df1cdbd8 100644 --- a/playbooks/clouds.yaml.j2 +++ b/playbooks/clouds.yaml.j2 @@ -83,6 +83,15 @@ clouds: application_credential_id: "{{ clouds_conf.regio_a_ac_id }}" application_credential_secret: "{{ clouds_conf.regio_a_ac_secret }}" auth_type: "v3applicationcredential" + scaleup-occ2: + auth_type: v3applicationcredential + auth: + auth_url: https://keystone.occ2.scaleup.cloud + application_credential_id: "{{ clouds_conf.scaleup_occ2_ac_id }}" + application_credential_secret: "{{ clouds_conf.scaleup_occ2_ac_secret }}" + region_name: "RegionOne" + interface: "public" + identity_api_version: 3 syseleven-dus2: interface: public identity_api_verion: 3
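
Reviewer note (not part of the patch): a minimal sketch of how the new read-only compliance-monitor views could be exercised. Assumptions: the monitor runs locally on port 8080 as started by `uvicorn.run` in monitor.py; the subject `gx-scs` and the IaaS scope UUID are taken from `overview.md.j2` and may differ in other deployments. The `table_full`/`detail_full` variants additionally require HTTP basic auth (see the README hunk), and a running instance can be told to re-read scopes, templates, and bootstrap accounts via the new SIGHUP handler (`kill -HUP <pid>`).

```python
#!/usr/bin/env python3
"""Sketch only: fetch the new read-only compliance-monitor endpoints."""
import urllib.request

BASE = "http://localhost:8080/"  # assumption: local instance; production uses SCM_BASE_URL
IAAS = "50393e6f-2ae1-4c5c-a62c-3b75f2abef3f"  # SCS-compatible IaaS scope uuid from overview.md.j2


def fetch(path: str) -> str:
    # all three view types (markdown, fragment, page) share the same path scheme
    with urllib.request.urlopen(BASE + path) as resp:
        return resp.read().decode("utf-8")


for view_type in ("markdown", "fragment", "page"):
    print(fetch(f"{view_type}/table")[:200])          # unprivileged compliance table

print(fetch(f"markdown/scope/{IAAS}")[:200])           # spec overview for the IaaS scope
print(fetch(f"markdown/detail/gx-scs/{IAAS}")[:200])   # compliance details for one subject
```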