diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 00000000..9b21c4ef --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,131 @@ +{ + "name": "${localWorkspaceFolder}", + + "dockerComposeFile": [ + "docker-compose.yml", + "docker-compose.override.yml" + ], + + "service": "hetzner-k3s", + + "workspaceFolder": "/home/app/hetzner-k3s", + + "shutdownAction": "stopCompose", + + "customizations": { + "vscode": { + "extensions": [ + "formulahendry.auto-close-tag", + "formulahendry.auto-rename-tag", + "aaron-bond.better-comments", + "angelomollame.conflict-squeezer", + "ms-vscode-remote.remote-containers", + "ms-azuretools.vscode-docker", + "mikestead.dotenv", + "kaiwood.endwise", + "usernamehw.errorlens", + "ninoseki.vscode-gem-lens", + "mhutchie.git-graph", + "donjayamanne.githistory", + "github.vscode-github-actions", + "GitHub.vscode-pull-request-github", + "GitHub.remotehub", + "codezombiech.gitignore", + "eamodio.gitlens", + "ZainChen.json", + "ziyasal.vscode-open-in-github", + "christian-kohler.path-intellisense", + "wayou.vscode-todo-highlight", + "redhat.vscode-xml", + "redhat.vscode-yaml", + "mutantdino.resourcemonitor", + "technosophos.vscode-helm", + "jgillich.crystal-lang-fixed" + // "crystal-lang-tools.crystal-lang" + ], + "recommendations": [ + "GitHub.copilot", + "GitHub.copilot-chat", + "JeroenV.github-copilot-with-context" + ], + "settings": { + "editor.tabSize": 2, + "editor.wordWrapColumn": 200, + "editor.renderWhitespace": "all", + "editor.accessibilitySupport": "off", + "editor.inlineSuggest.enabled": true, + "editor.bracketPairColorization.enabled": true, + "editor.foldingImportsByDefault": true, + "editor.tabCompletion": "on", + "editor.fontFamily": "Source Code Pro", + "editor.linkedEditing": true, + "editor.stickyScroll.enabled": true, + + "diffEditor.codeLens": true, + + "files.trimTrailingWhitespace": true, + "files.insertFinalNewline": true, + "files.exclude": { + "**/.history": true + }, + + "terminal.integrated.defaultProfile.linux": "zsh", + "terminal.integrated.shell.linux": "/bin/zsh", + "terminal.integrated.cwd": "/home/app/hetzner-k3s", + "terminal.integrated.allowChords": false, + "terminal.explorerKind": "integrated", + "terminal.integrated.shellIntegration.enabled": true, + "terminal.integrated.scrollback": 20000, + + "scm.autoReveal": false, + + "git.autofetch": true, + "git.pruneOnFetch": true, + "git.openRepositoryInParentFolders": "never", + + "gitlens.ai.experimental.provider": "openai", + "gitlens.ai.experimental.openai.model": "gpt-4-turbo-preview", + + "githubPullRequests.pullBranch": "never", + + "github.copilot.enable": { + "*": true, + "plaintext": false, + "markdown": false, + "scminput": false, + "yaml": true + }, + + "security.workspace.trust.untrustedFiles": "open", + + "redhat.telemetry.enabled": false, + + "yaml.schemas": { + "kubernetes": "*.yaml,*.yml" + }, + + "workbench.commandPalette.preserveInput": true, + "workbench.startupEditor": "none", + "workbench.editor.closeOnFileDelete": true, + "workbench.editor.highlightModifiedTabs": true, + "workbench.editor.autoLockGroups": { + "terminalEditor": false + }, + + "explorer.openEditors.visible": 1, + "explorer.autoReveal": false, + "explorer.confirmDragAndDrop": false, + + "[json]": { + "editor.defaultFormatter": "vscode.json-language-features" + }, + + "telemetry.telemetryLevel": "off" + } + } + }, + + "containerEnv": { + "PROJECT": "${localWorkspaceFolder}" + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..163eb75c --- /dev/null 
+++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*.cr] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..89e1e5ea --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,4 @@ +# These are supported funding model platforms + +github: [vitobotta] + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..b46682be --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,85 @@ +name: Create release + +on: + push: + tags: + - 'v*' + +jobs: + + build_releases: + name: Build binary - ${{ matrix.binary_os_suffix }}-${{ matrix.binary_arch_suffix }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VERSION: ${{ github.ref_name }} + MESSAGE: "WIP" + FILENAME: hetzner-k3s-${{ matrix.binary_os_suffix }}-${{ matrix.binary_arch_suffix }} + + strategy: + matrix: + include: + - runs_on_tag: macos-13 + binary_os_suffix: macos + binary_arch_suffix: amd64 + - runs_on_tag: macos-14 + binary_os_suffix: macos + binary_arch_suffix: arm64 + architecture: arm + - runs_on_tag: ubuntu-22.04 + binary_os_suffix: linux + binary_arch_suffix: arm64 + arch: aarch64 + distro: alpine_latest + - runs_on_tag: ubuntu-24.04 + binary_os_suffix: linux + binary_arch_suffix: amd64 + arch: none + distro: none + base_image: amd64/alpine + + runs-on: ${{ matrix.runs_on_tag }} + + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + set-safe-directory: '/home/runner/work/hetzner-k3s/hetzner-k3s' + + - if: matrix.binary_os_suffix == 'macos' + name: macos build step + run: | + brew install crystal libssh2 openssl@3 + shards install --without-development + crystal build src/hetzner-k3s.cr --release + chmod +x hetzner-k3s + cp hetzner-k3s ${{ env.FILENAME }} + + - if: matrix.binary_os_suffix == 'linux' + uses: uraimo/run-on-arch-action@v2 + name: linux build step + with: + arch: ${{ matrix.arch }} + distro: ${{ matrix.distro }} + base_image: ${{ matrix.base_image }} + + shell: /bin/sh + + install: | + apk update + apk add --update --no-cache gcc gmp-dev libevent-static musl-dev pcre-dev pcre2-dev libxml2-dev \ + libxml2-static openssl-dev openssl-libs-static tzdata yaml-static zlib-static xz-static \ + make git autoconf automake libtool patch libssh2-static libssh2-dev crystal shards + + run: | + shards install --without-development + crystal build src/hetzner-k3s.cr --release --static + cp hetzner-k3s ${{ env.FILENAME }} + + - name: Publish new version + uses: svenstaro/upload-release-action@v2 + with: + repo_token: ${{ env.GITHUB_TOKEN }} + file: ${{ env.FILENAME }} + tag: ${{ env.VERSION }} + overwrite: true + body: ${{ env.MESSAGE }} diff --git a/.gitignore b/.gitignore index be321203..d379665e 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,27 @@ .rspec_status /kubeconfig /cluster_config.yaml +dist/hetzner-k3s.jar +dist/hetzner-k3s + +/lib/ +/bin/ +/.shards/ +*.dwarf +hetzner-k3s +k3s-releases.yaml +actions-runner +/.idea/ +.DS_Store +.env.vars +.zsh_history +temp +docker-compose.override.yml +cilium +calicoctl + +e2e-tests/env +e2e-tests/sshkey* +e2e-tests/test-* +create.log +delete.log diff --git a/.rspec b/.rspec deleted file mode 100644 index 34c5164d..00000000 --- a/.rspec +++ /dev/null @@ -1,3 +0,0 @@ ---format documentation ---color ---require spec_helper diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index a12ad5e1..00000000 --- a/.travis.yml +++ 
/dev/null @@ -1,6 +0,0 @@ ---- -language: ruby -cache: bundler -rvm: - - 2.7.2 -before_install: gem install bundler -v 2.1.4 diff --git a/.zshrc b/.zshrc new file mode 100644 index 00000000..1d86132f --- /dev/null +++ b/.zshrc @@ -0,0 +1,63 @@ +ZSH=$HOME/.oh-my-zsh + +autoload -U +X compinit && compinit +autoload -U +X bashcompinit && bashcompinit + +plugins=(zsh-autosuggestions kubectl kubectx git-prompt) + +export LC_CTYPE=en_US.UTF-8 +export LC_ALL=en_US.UTF-8 +export EDITOR='code --wait' +export VISUAL='code --wait' +export TERM=xterm-256color +export XDG_CONFIG_HOME=$HOME/.config +export ZSH_AUTOSUGGEST_ACCEPT_WIDGETS=("${(@)ZSH_AUTOSUGGEST_ACCEPT_WIDGETS:#forward-char}") +export PATH="/home/app/hetzner-k3s/bin:$HOME/.krew/bin:./bin:$HOME/bin:$GOPATH/bin:$PATH" +export HISTFILE="/home/app/hetzner-k3s/.zsh_history" + +source $ZSH/oh-my-zsh.sh +source <(kubectl completion zsh) +source <(stern --completion=zsh) + +alias k="kubectl" +alias stern="stern -s 1s" + +bindkey '^a' autosuggest-accept +bindkey '\C-[OC' forward-word # ctrl-right +bindkey "\e[1;5C" forward-word # ctrl-right + +ulimit -n 65536 + +setTerminalText () { + local mode=$1 ; shift + echo -ne "\033]$mode;$@\007" +} + +stt_title () { + setTerminalText 2 $@; +} + +k8s_prompt_info() { + local ctx=$(kubectl config current-context 2>/dev/null) + local ns=$(kubectl config view --minify --output 'jsonpath={..namespace}' 2>/dev/null) + + if [[ -n $ctx ]]; then + echo "[%{$fg_bold[green]%}$ctx%{$reset_color%}:%{$fg_bold[blue]%}$ns%{$reset_color%}]" + fi +} + +PROMPT='[${${PROJECT##*/}%}] %~%b $(git_super_status) $(k8s_prompt_info)%\> ' +RPROMPT='%T' + +set -a +source /home/app/hetzner-k3s/.env.vars +set +a + +echo "*" > /home/app/hetzner-k3s/tmp/.gitignore +echo "!.gitignore" >> /home/app/hetzner-k3s/tmp/.gitignore + +eval "$(ssh-agent -s)" + +ssh-add -k ~/.ssh/id_*[!.pub] + +clear diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 00000000..f2ae910d --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,41 @@ +FROM alpine:3.20.1 + +RUN apk update \ + && apk add --update --no-cache gcc gmp-dev libevent-static musl-dev pcre-dev pcre2-dev libxml2-dev \ + libxml2-static openssl-dev openssl-libs-static tzdata yaml-static zlib-static xz-static \ + make git autoconf automake libtool patch libssh2-static libssh2-dev crystal shards \ + curl docker zsh bash openssl k9s shadow go envsubst util-linux \ + gcc g++ libc-dev libxml2-dev openssl-dev yaml-dev zlib-dev crystal openssh-client + +RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/$(uname -m | sed -e 's/x86_64/amd64/' -e 's/aarch64/arm64/')/kubectl" && chmod +x ./kubectl && mv ./kubectl /usr/local/bin/kubectl + +RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" \ + && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions + +RUN set -x; cd "$(mktemp -d)" \ + && OS="$(uname | tr '[:upper:]' '[:lower:]')" \ + && ARCH="$(uname -m | sed -e 's/x86_64/amd64/' -e 's/\(arm\)\(64\)\?.*/\1\2/' -e 's/aarch64$/arm64/')" \ + && KREW="krew-${OS}_${ARCH}" \ + && curl -fsSLO "https://github.com/kubernetes-sigs/krew/releases/latest/download/${KREW}.tar.gz" \ + && tar zxvf "${KREW}.tar.gz" \ + && ./"${KREW}" install krew \ + && PATH="$HOME/.krew/bin:$PATH" kubectl krew install ctx \ + && PATH="$HOME/.krew/bin:$PATH" kubectl krew install ns + +RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + 
+ENV GOPATH=/root/go + +RUN mkdir -p $GOPATH/bin + +RUN go install github.com/stern/stern@latest + +SHELL ["/bin/zsh", "-c"] + +RUN echo '/bin/zsh' | chsh `whoami` + +RUN rm /root/.zshrc && ln -s /home/app/hetzner-k3s/.zshrc /root/.zshrc + +ENV DEVCONTAINER=true + +CMD ["tail", "-f", "/dev/null"] diff --git a/Gemfile b/Gemfile deleted file mode 100644 index 69d0b2b4..00000000 --- a/Gemfile +++ /dev/null @@ -1,7 +0,0 @@ -source "https://rubygems.org" - -# Specify your gem's dependencies in k3s.gemspec -gemspec - -gem "rake", "~> 12.0" -gem "rspec", "~> 3.0" diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index 261f21b6..00000000 --- a/Gemfile.lock +++ /dev/null @@ -1,111 +0,0 @@ -PATH - remote: . - specs: - hetzner-k3s (0.3.0) - http - k8s-ruby - net-ssh - sshkey - thor - -GEM - remote: https://rubygems.org/ - specs: - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) - concurrent-ruby (1.1.9) - diff-lcs (1.4.4) - domain_name (0.5.20190701) - unf (>= 0.0.5, < 1.0.0) - dry-configurable (0.12.1) - concurrent-ruby (~> 1.0) - dry-core (~> 0.5, >= 0.5.0) - dry-container (0.8.0) - concurrent-ruby (~> 1.0) - dry-configurable (~> 0.1, >= 0.1.3) - dry-core (0.7.1) - concurrent-ruby (~> 1.0) - dry-equalizer (0.3.0) - dry-inflector (0.2.1) - dry-logic (0.6.1) - concurrent-ruby (~> 1.0) - dry-core (~> 0.2) - dry-equalizer (~> 0.2) - dry-struct (0.5.1) - dry-core (~> 0.4, >= 0.4.3) - dry-equalizer (~> 0.2) - dry-types (~> 0.13) - ice_nine (~> 0.11) - dry-types (0.13.4) - concurrent-ruby (~> 1.0) - dry-container (~> 0.3) - dry-core (~> 0.4, >= 0.4.4) - dry-equalizer (~> 0.2) - dry-inflector (~> 0.1, >= 0.1.2) - dry-logic (~> 0.4, >= 0.4.2) - excon (0.85.0) - ffi (1.15.3) - ffi-compiler (1.0.1) - ffi (>= 1.0.0) - rake - hashdiff (1.0.1) - http (4.4.1) - addressable (~> 2.3) - http-cookie (~> 1.0) - http-form_data (~> 2.2) - http-parser (~> 1.2.0) - http-cookie (1.0.4) - domain_name (~> 0.5) - http-form_data (2.3.0) - http-parser (1.2.3) - ffi-compiler (>= 1.0, < 2.0) - ice_nine (0.11.2) - jsonpath (0.9.9) - multi_json - to_regexp (~> 0.2.1) - k8s-ruby (0.10.5) - dry-struct (~> 0.5.0) - dry-types (~> 0.13.0) - excon (~> 0.71) - hashdiff (~> 1.0.0) - jsonpath (~> 0.9.5) - recursive-open-struct (~> 1.1.0) - yajl-ruby (~> 1.4.0) - yaml-safe_load_stream (~> 0.1) - multi_json (1.15.0) - net-ssh (6.1.0) - public_suffix (4.0.6) - rake (12.3.3) - recursive-open-struct (1.1.3) - rspec (3.10.0) - rspec-core (~> 3.10.0) - rspec-expectations (~> 3.10.0) - rspec-mocks (~> 3.10.0) - rspec-core (3.10.1) - rspec-support (~> 3.10.0) - rspec-expectations (3.10.1) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.10.0) - rspec-mocks (3.10.2) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.10.0) - rspec-support (3.10.2) - sshkey (2.0.0) - thor (1.1.0) - to_regexp (0.2.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.7.7) - yajl-ruby (1.4.1) - yaml-safe_load_stream (0.1.1) - -PLATFORMS - ruby - -DEPENDENCIES - hetzner-k3s! 
- rake (~> 12.0) - rspec (~> 3.0) - -BUNDLED WITH - 2.1.4 diff --git a/LICENSE.txt b/LICENSE.txt index 458dd912..a1d7f945 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2021 Vito Botta +Copyright (c) 2023 Vito Botta Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 0d87b298..08f83602 100644 --- a/README.md +++ b/README.md @@ -1,238 +1,76 @@ -# Create production grade Kubernetes clusters in Hetzner Cloud in a couple of minutes or less +![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/vitobotta/hetzner-k3s) +![GitHub Release Date](https://img.shields.io/github/release-date/vitobotta/hetzner-k3s) +![GitHub last commit](https://img.shields.io/github/last-commit/vitobotta/hetzner-k3s) +![GitHub issues](https://img.shields.io/github/issues-raw/vitobotta/hetzner-k3s) +![GitHub pull requests](https://img.shields.io/github/issues-pr-raw/vitobotta/hetzner-k3s) +![GitHub](https://img.shields.io/github/license/vitobotta/hetzner-k3s) +![GitHub Discussions](https://img.shields.io/github/discussions/vitobotta/hetzner-k3s) +![GitHub top language](https://img.shields.io/github/languages/top/vitobotta/hetzner-k3s) + +![GitHub forks](https://img.shields.io/github/forks/vitobotta/hetzner-k3s?style=social) +![GitHub Repo stars](https://img.shields.io/github/stars/vitobotta/hetzner-k3s?style=social) -This is a CLI tool - based on a Ruby gem - to quickly create and manage Kubernetes clusters in [Hetzner Cloud](https://www.hetzner.com/cloud) using the lightweight Kubernetes distribution [k3s](https://k3s.io/) from [Rancher](https://rancher.com/). - -Hetzner Cloud is an awesome cloud provider which offers a truly great service with the best performance/cost ratio in the market. I highly recommend them if European locations (Germany and Finland) are OK for your projects (the Nuremberg data center has decent latency for US users as well). With Hetzner's Cloud Controller Manager and CSI driver you can provision load balancers and persistent volumes very easily. - -k3s is my favorite Kubernetes distribution now because it uses much less memory and CPU, leaving more resources to workloads. It is also super quick to deploy because it's a single binary. - -Using this tool, creating a highly available k3s cluster with 3 masters for the control plane and 3 worker nodes takes about **a couple of minutes** only. 
This includes - -- creating the infra resources (servers, private network, firewall, load balancer for the API server for HA clusters) -- deploying k3s to the nodes -- installing the [Hetzner Cloud Controller Manager](https://github.com/hetznercloud/hcloud-cloud-controller-manager) to provision load balancers right away -- installing the [Hetzner CSI Driver](https://github.com/hetznercloud/csi-driver) to provision persistent volumes using Hetzner's block storage -- installing the [Rancher System Upgrade Controller](https://github.com/rancher/system-upgrade-controller) to make upgrades to a newer version of k3s easy and quick - - -## Requirements - -All that is needed to use this tool is - -- an Hetzner Cloud account -- an Hetzner Cloud token: for this you need to create a project from the cloud console, and then an API token with **both read and write permissions** (sidebar > Security > API Tokens); you will see the token only once, so ensure you take note of it somewhere safe -- a recent Ruby runtime installed (see [this page](https://www.ruby-lang.org/en/documentation/installation/) for instructions if you are not familiar with Ruby). I am also going to try and create single binaries for this tool that will include the Ruby runtime, for easier installation. - -## Installation - -Once you have the Ruby runtime up and running, you just need to install the gem: - -```bash -gem install hetzner-k3s -``` - -This will install the `hetzner-k3s` executable in your PATH. - -## Creating a cluster - -The tool requires a simple configuration file in order to create/upgrade/delete clusters, in the YAML format like in the example below: - -```yaml --- -hetzner_token: -cluster_name: test -kubeconfig_path: "./kubeconfig" -k3s_version: v1.21.3+k3s1 -ssh_key_path: "~/.ssh/id_rsa.pub" -verify_host_key: false -location: nbg1 -masters: - instance_type: cpx21 - instance_count: 3 -worker_node_pools: -- name: small - instance_type: cpx21 - instance_count: 4 -- name: big - instance_type: cpx31 - instance_count: 2 -``` - -It should hopefully be self explanatory; you can run `hetzner-k3s releases` to see a list of the available releases from the most recent to the oldest available. - -If you set `masters.instance_count` to 1 then the tool will create a non highly available control plane; for production clusters you may want to set it to a number greater than 1. This number must be odd to avoid split brain issues with etcd and the recommended number is 3. - -You can specify any number of worker node pools for example to have mixed nodes with different specs for different workloads. - -At the moment Hetzner Cloud has three locations: two in Germany (`nbg1`, Nuremberg and `fsn1`, Falkensteing) and one in Finland (`hel1`, Helsinki). - -For the available instance types and their specs, either check from inside a project when adding a server manually or run the following with your Hetzner token: - -```bash -curl \ - -H "Authorization: Bearer $API_TOKEN" \ - 'https://api.hetzner.cloud/v1/server_types' -``` - - -Note: the option `verify_host_key` is by default set to `false` to disable host key verification. This is because sometimes when creating new servers, Hetzner may assign IP addresses that were previously used by other servers you owned in the past. Therefore the host key verification would fail. If you set this option to `true` and this happens, the tool won't be able to continue creating the cluster until you resolve the issue with one of the suggestions it will give you. 
- -Finally, to create the cluster run: - -```bash -hetzner-k3s create-cluster --config-file cluster_config.yaml -``` - -This will take a couple of minutes or less depending on the number of masters and worker nodes. - -If you are creating an HA cluster and see the following in the output you can safely ignore it - it happens when additional masters are joining the first one: - -``` -Job for k3s.service failed because the control process exited with error code. -See "systemctl status k3s.service" and "journalctl -xe" for details. -``` - - -### Idempotency - -The `create-cluster` command can be run any number of times with the same configuration without causing any issue, since the process is idempotent. This means that if for some reason the create process gets stuck or throws errors (for example if the Hetzner API is unavailable or there are timeouts etc), you can just stop the current command, and re-run it with the same configuration to continue from where it left. - -### Adding nodes - -To add one or more nodes to a node pool, just change the instance count in the configuration file for that node pool and re-run the create command. -### Scaling down a node pool +# The easiest and fastest way to create production grade Kubernetes clusters in Hetzner Cloud -To make a node pool smaller: -- decrease the instance count for the node pool in the configuration file so that those extra nodes are not recreated in the future -- delete the nodes from Kubernetes (`kubectl delete node `) -- delete the instances from the cloud console (make sure you delete the correct ones :p) -In a future relese I will add some automation for the cleanup. +## What is this? -### Replacing a problematic node +This is a CLI tool to super quickly and super easily create and manage Kubernetes clusters in [Hetzner Cloud](https://www.hetzner.com/cloud) using the lightweight Kubernetes distribution [k3s](https://k3s.io/) from [Rancher](https://rancher.com/). In a recent test I created a 200 node HA cluster (3 masters, 197 worker nodes) in just **under 4 minutes** (when using only public network since private networks are limited to 100 instances per network). I believe this is a world record :) -- delete the node from Kubernetes (`kubectl delete node `) -- delete the correct instance from the cloud console -- re-run the create script. This will re-create the missing node and have it join to the cluster +Hetzner Cloud is an awesome cloud provider which offers a truly great service with the best performance/cost ratio in the market and locations in both Europe and USA. +k3s is my favorite Kubernetes distribution because it uses much less memory and CPU, leaving more resources to workloads. It is also super quick to deploy and upgrade because it's a single binary. -### Converting a non-HA cluster to HA +Using `hetzner-k3s`, creating a highly available k3s cluster with 3 masters for the control plane and 3 worker nodes takes **2-3 minutes** only. This includes -It's easy to convert a non-HA with a single master cluster to HA with multiple masters. Just change the masters instance count and re-run the create command. This will create a load balancer for the API server and update the kubeconfig so that all the API requests go through the load balancer. 
- -## Upgrading to a new version of k3s - -If it's the first time you upgrade the cluster, all you need to do to upgrade it to a newer version of k3s is run the following command: - -```bash -hetzner-k3s upgrade-cluster --config-file cluster_config.yaml --new-k3s-version v1.21.3+k3s1 -``` - -So you just need to specify the new k3s version as an additional parameter and the configuration file will be updated with the new version automatically during the upgrade. To see the list of available k3s releases run the command `hetzner-k3s releases`. - -Note that the API server will briefly be unavailable during the upgrade of the controlplane. - -To check the upgrade progress, run `watch kubectl get nodes -owide`. You will see the masters being upgraded one per time, followed by the worker nodes. - - -### What to do if the upgrade doesn't go smoothly - -If the upgrade gets stuck for some reason, or it doesn't upgrade all the nodes: - -1. Clean up the existing upgrade plans and jobs, and restart the upgrade controller - -```bash -kubectl -n system-upgrade delete job --all -kubectl -n system-upgrade delete plan --all - -kubectl label node --all plan.upgrade.cattle.io/k3s-server- plan.upgrade.cattle.io/k3s-agent- - -kubectl -n system-upgrade rollout restart deployment system-upgrade-controller -kubectl -n system-upgrade rollout status deployment system-upgrade-controller -``` - -I recommend running the above commands also when upgrading a cluster that has already been upgraded at least once previously, since the upgrade leaves some stuff behind that needs to be cleaned up. - -2. Re-run the `upgrade-cluster` command with an additiona parameter `--force true`. - -I have noticed that sometimes I need to re-run the upgrade command a couple of times to complete an upgrade successfully. Must be some bug in the system upgrade controller but I haven't investigated further. - -You can also check the logs of the system upgrade controller's pod: - -```bash -kubectl -n system-upgrade logs -f $(kubectl -n system-upgrade get pod -l pod-template-hash -o jsonpath="{.items[0].metadata.name}") -``` - -A final note about upgrades is that if for some reason the upgrade gets stuck after upgrading the masters and before upgrading the worker nodes, just cleaning up the resources as described above might not be enough. In that case also try running the following to tell the upgrade job for the workers that the masters have already been upgraded, so the upgrade can continue for the workers: - -```bash -kubectl label node plan.upgrade.cattle.io/k3s-server=upgraded -``` - -## Deleting a cluster - -To delete a cluster, running - -```bash -hetzner-k3s delete-cluster --config-file cluster_config.yam -``` - -This will delete all the resources in the Hetzner Cloud project for the cluster being deleted. - - -## Additional info - -### Load balancers - -Once the cluster is ready, you can already provision services of type LoadBalancer for your workloads (such as the Nginx ingress controller for example) thanks to the Hetzner Cloud Controller Manager that is installed automatically. - -There are some annotations that you can add to your services to configure the load balancers. 
I personally use the following: - -```yaml - service: - annotations: - load-balancer.hetzner.cloud/hostname: - load-balancer.hetzner.cloud/http-redirect-https: 'false' - load-balancer.hetzner.cloud/location: nbg1 - load-balancer.hetzner.cloud/name: - load-balancer.hetzner.cloud/uses-proxyprotocol: 'true' - load-balancer.hetzner.cloud/use-private-ip: "true" -``` - -I set `load-balancer.hetzner.cloud/hostname` to a valid hostname that I configure (after creating the load balancer) with the IP of the load balancer; I use this together with the annotation `load-balancer.hetzner.cloud/uses-proxyprotocol: 'true'` to enable the proxy protocol. Reason: I enable the proxy protocol on the load balancers so that my ingress controller and applications can "see" the real IP address of the client. However when this is enabled, there is a problem where [cert-manager](https://cert-manager.io/docs/) fails http01 challenges; you can find an explanation of why [here](https://github.com/compumike/hairpin-proxy) but the easy fix provided by some providers - including Hetzner - is to configure the load balancer so that it uses a hostname instead of an IP. Again, read the explanation for the reason but if you care about seeing the actual IP of the client then I recommend you use these two annotations. +- creating all the infrastructure resources (instances, private network, firewall) +- deploying k3s to the nodes +- installing the [Hetzner Cloud Controller Manager](https://github.com/hetznercloud/hcloud-cloud-controller-manager) to provision load balancers right away +- installing the [Hetzner CSI Driver](https://github.com/hetznercloud/csi-driver) to provision persistent volumes using Hetzner's block storage +- installing the [Rancher System Upgrade Controller](https://github.com/rancher/system-upgrade-controller) to make upgrades to a newer version of k3s easy and quick +- installing the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler) to allow for autoscaling node pools -The annotation `load-balancer.hetzner.cloud/use-private-ip: "true"` ensures that the communication between the load balancer and the nodes happens through the private network, so we don't have to open any ports on the nodes (other than the port 6443 for the Kubernetes API server). +Also see this [documentation page](https://github.com/vitobotta/hetzner-k3s/blob/main/docs/Setting%20up%20a%20cluster.md) for a tutorial on how to set up a cluster with the most common setup to get you started. -The other annotations should be self explanatory. You can find a list of the available annotations here. +If you like this project and would like to help its development, consider [becoming a sponsor](https://github.com/sponsors/vitobotta). -## Persistent volumes +___ +## Who am I? -Once the cluster is ready you can create persistent volumes out of the box with the default storage class `hcloud-volumes`, since the Hetzner CSI driver is installed automatically. This will use Hetzner's block storage (based on Ceph so it's replicated and highly available) for your persistent volumes. Note that the minimum size of a volume is 10Gi. If you specify a smaller size for a volume, the volume will be created with a capacity of 10Gi anyway. +I'm the Lead Platform Architect for event management platform [Brella](https://www.brella.io/), based in Finland. I am responsible for all the technical aspects of the platform including development, infrastructure and mentoring developers. 
In my free time I act as a bug bounty hunter to find and responsibly disclose vulnerabilities in web applications. +See my public profile with links for connecting with me [here](https://vitobotta.com/). -## changelog +--- -- 0.3.1 - - Allow enabling/disabling the host key verification +## Docs -- 0.3.0 - - Handle case when an SSH key with the given fingerprint already exists in the Hetzner project - - Handle a timeout of 5 seconds for requests to the Hetzner API - - Retry waiting for server to be up when timeouts/host-unreachable errors occur - - Ignore known_hosts entry to prevent errors when recreating servers with IPs that have been used previously +- [Installation](docs/Installation.md) +- [Creating a cluster](docs/Creating_a_cluster.md) +- [Setting up a cluster](docs/Setting%20up%20a%20cluster.md) +- [Recommendations](docs/Recommendations.md) +- [Maintenance](docs/Maintenance.md) +- [Deleting a cluster](docs/Deleting_a_cluster.md) +- [Load balancers](docs/Load_balancers.md) +- [Storage](docs/Storage.md) +- [Troubleshooting](docs/Troubleshooting.md) +- [Contributing and support](docs/Contributing_and_support.md) -- 0.2.0 - - Allow mixing servers of different series Intel/AMD -## Contributing and support +___ +## Code of conduct -Please create a PR if you want to propose any changes, or open an issue if you are having trouble with the tool - I will do my best to help if I can. +Everyone interacting in the hetzner-k3s project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/vitobotta/hetzner-k3s/blob/main/CODE_OF_CONDUCT.md). +___ ## License -The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). +This tool is available as open source under the terms of the [MIT License](https://github.com/vitobotta/hetzner-k3s/blob/main/LICENSE.txt). + +___ -## Code of Conduct +## Stargazers over time -Everyone interacting in the hetzner-k3s project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/vitobotta/k3s/blob/master/CODE_OF_CONDUCT.md). +[![Stargazers over time](https://starchart.cc/vitobotta/hetzner-k3s.svg)](https://starchart.cc/vitobotta/hetzner-k3s) diff --git a/Rakefile b/Rakefile deleted file mode 100644 index b7e9ed54..00000000 --- a/Rakefile +++ /dev/null @@ -1,6 +0,0 @@ -require "bundler/gem_tasks" -require "rspec/core/rake_task" - -RSpec::Core::RakeTask.new(:spec) - -task :default => :spec diff --git a/bin/console b/bin/console deleted file mode 100755 index 29c22426..00000000 --- a/bin/console +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env ruby - -require "bundler/setup" -require "k3s" - -# You can add fixtures and/or initialization code here to make experimenting -# with your gem easier. You can also use a different console, if you like. - -# (If you use this, don't forget to add pry to your Gemfile!) 
-# require "pry" -# Pry.start - -require "irb" -IRB.start(__FILE__) diff --git a/bin/setup b/bin/setup deleted file mode 100755 index dce67d86..00000000 --- a/bin/setup +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -IFS=$'\n\t' -set -vx - -bundle install - -# Do any other automated setup that you need to do here diff --git a/cluster_config.yaml.example b/cluster_config.yaml.example deleted file mode 100644 index 1c6ae74a..00000000 --- a/cluster_config.yaml.example +++ /dev/null @@ -1,18 +0,0 @@ ---- -hetzner_token: blah -cluster_name: test -kubeconfig_path: "../kubeconfig" -k3s_version: v1.21.3+k3s1 -ssh_key_path: "~/.ssh/id_rsa.pub" -verify_host_key: false -location: nbg1 -masters: - instance_type: cpx21 - instance_count: 3 -worker_node_pools: -- name: small - instance_type: cpx21 - instance_count: 4 -- name: big - instance_type: cp321 - instance_count: 2 diff --git a/docker-compose.override.yml.example b/docker-compose.override.yml.example new file mode 100644 index 00000000..3b5ca842 --- /dev/null +++ b/docker-compose.override.yml.example @@ -0,0 +1,10 @@ +version: '3.8' + +volumes: + kube: + vscode_cache: + +services: + hetzner-k3s: + volumes: + - ... diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..da7876c5 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +version: '3.8' + +volumes: + kube: + vscode_cache: + +services: + hetzner-k3s: + # If you want to connect to your servers over IPv6 when running in a container, + # you will need to configure Docker to provide IPv6 connectivity to containers. + # Alternatively, if you're working on Linux and your host machine has IPv6, + # you can use "network_mode: host". + #network_mode: host + build: + context: . + dockerfile: Dockerfile.dev + volumes: + - ${PWD}:/home/app/hetzner-k3s:cached + - ${HOME}/.ssh:/root/.ssh + - /var/run/docker.sock:/var/run/docker.sock + - kube:/root/.kube + - vscode_cache:/vscode/vscode-server:cache + working_dir: /home/app/hetzner-k3s diff --git a/docs/Contributing_and_support.md b/docs/Contributing_and_support.md new file mode 100644 index 00000000..9de7fd32 --- /dev/null +++ b/docs/Contributing_and_support.md @@ -0,0 +1,50 @@ +# Contributing and support + +Please create a PR if you want to propose any changes, or open an issue if you are having trouble with the tool - I will do my best to help if I can. + +If you would like to financially support the project, consider [becoming a sponsor](https://github.com/sponsors/vitobotta). + +___ +## Building from source + +This tool is written in [Crystal](https://crystal-lang.org/). To build it, or to make some changes in the code and try them, you will need to install Crystal locally, or to work in a container. + +This repository contains a Dockerfile that builds a container image with Crystal as well as the other required dependencies. There is also a Compose file to conveniently run a container using that image, and mount the source code into the container. Finally, there is a devcontainer file that you can use with compatible IDEs like Visual Studio Code and the Dev Containers extension. + + +### Developing with VSCode + +You need [Visual Studio Code](https://code.visualstudio.com/) and the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers&ssr=false). Open the project in VSCode (for instance, by executing `code .` in the root directory of this git repository). You should see a pop-up dialog prompting you to "Reopen in Container". 
Do that, then wait until the build is complete and the server has started; then click on "+" to open a terminal inside the container.
+
+Note: if for some reason you can't find the Dev Containers extension in the Marketplace (for instance, if the first result is the Docker extension instead of Dev Containers), check that you have the official build of VSCode. It looks like if you're running an Open Source build, some extensions are disabled.
+
+
+### Developing with Compose
+
+If you can't or don't want to install VSCode, you can also develop in the exact same container with Docker and Compose.
+
+To build and run the development container, run:
+```bash
+docker compose up -d
+```
+
+Then, to enter the container:
+```bash
+docker compose exec hetzner-k3s bash
+```
+
+
+### Inside the container
+
+Once you are inside the dev container (whether you used VSCode or Docker Compose directly), you can run `hetzner-k3s` like this:
+```bash
+crystal run ./src/hetzner-k3s.cr -- create --config cluster_config.yaml
+```
+
+To generate a binary, you can do:
+```bash
+crystal build ./src/hetzner-k3s.cr --static
+```
+
+The `--static` flag will make sure that the resulting binary is statically linked, and doesn't have dependencies on libraries that may or may not be available on the system where you want to run it.
+
diff --git a/docs/Creating_a_cluster.md b/docs/Creating_a_cluster.md
new file mode 100644
index 00000000..cb6c716c
--- /dev/null
+++ b/docs/Creating_a_cluster.md
@@ -0,0 +1,206 @@
+# Creating a cluster
+
+The tool requires a simple configuration file in order to create/upgrade/delete clusters, in the YAML format like in the example below (commented lines are for optional settings):
+
+```yaml
+---
+hetzner_token:
+cluster_name: test
+kubeconfig_path: "./kubeconfig"
+k3s_version: v1.30.3+k3s1
+
+networking:
+  ssh:
+    port: 22
+    use_agent: false # set to true if your key has a passphrase
+    public_key_path: "~/.ssh/id_ed25519.pub"
+    private_key_path: "~/.ssh/id_ed25519"
+  allowed_networks:
+    ssh:
+      - 0.0.0.0/0
+    api: # this will firewall port 6443 on the nodes; it will NOT firewall the API load balancer
+      - 0.0.0.0/0
+  public_network:
+    ipv4: true
+    ipv6: true
+  private_network:
+    enabled: true
+    subnet: 10.0.0.0/16
+    existing_network_name: ""
+  cni:
+    enabled: true
+    encryption: false
+    mode: flannel
+
+  # cluster_cidr: 10.244.0.0/16 # optional: a custom IPv4/IPv6 network CIDR to use for pod IPs
+  # service_cidr: 10.43.0.0/16 # optional: a custom IPv4/IPv6 network CIDR to use for service IPs. Warning, if you change this, you should also change cluster_dns!
+  # cluster_dns: 10.43.0.10 # optional: IPv4 Cluster IP for coredns service. Needs to be an address from the service_cidr range
+
+
+# manifests:
+# cloud_controller_manager_manifest_url: "https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/v1.20.0/ccm-networks.yaml"
+# csi_driver_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/v2.9.0/deploy/kubernetes/hcloud-csi.yml"
+# system_upgrade_controller_deployment_manifest_url: "https://github.com/rancher/system-upgrade-controller/releases/download/v0.13.4/system-upgrade-controller.yaml"
+# system_upgrade_controller_crd_manifest_url: "https://github.com/rancher/system-upgrade-controller/releases/download/v0.13.4/crd.yaml"
+# cluster_autoscaler_manifest_url: "https://raw.githubusercontent.com/kubernetes/autoscaler/master/cluster-autoscaler/cloudprovider/hetzner/examples/cluster-autoscaler-run-on-master.yaml"
+
+datastore:
+  mode: etcd # etcd (default) or external
+  external_datastore_endpoint: postgres://....
+
+schedule_workloads_on_masters: false
+
+# image: rocky-9 # optional: default is ubuntu-24.04
+# autoscaling_image: 103908130 # optional, defaults to the `image` setting
+# snapshot_os: microos # optional: specifies the OS type when using a custom snapshot
+
+masters_pool:
+  instance_type: cpx21
+  instance_count: 3
+  location: nbg1
+
+worker_node_pools:
+- name: small-static
+  instance_type: cpx21
+  instance_count: 4
+  location: hel1
+  # image: debian-11
+  # labels:
+  #   - key: purpose
+  #     value: blah
+  # taints:
+  #   - key: something
+  #     value: value1:NoSchedule
+- name: medium-autoscaled
+  instance_type: cpx31
+  instance_count: 2
+  location: fsn1
+  autoscaling:
+    enabled: true
+    min_instances: 0
+    max_instances: 3
+
+embedded_registry_mirror:
+  enabled: true
+
+# additional_packages:
+# - somepackage
+
+# post_create_commands:
+# - apt update
+# - apt upgrade -y
+# - apt autoremove -y
+
+# kube_api_server_args:
+# - arg1
+# - ...
+# kube_scheduler_args:
+# - arg1
+# - ...
+# kube_controller_manager_args:
+# - arg1
+# - ...
+# kube_cloud_controller_manager_args:
+# - arg1
+# - ...
+# kubelet_args:
+# - arg1
+# - ...
+# kube_proxy_args:
+# - arg1
+# - ...
+# api_server_hostname: k8s.example.com # optional: DNS for the k8s API LoadBalancer. After the script has run, create a DNS record with the address of the API LoadBalancer.
+```
+
+Most settings should be self explanatory; you can run `hetzner-k3s releases` to see a list of the available k3s releases.
+
+If you don't want to specify the Hetzner token in the config file (for example if you want to use the tool with CI or want to safely commit the config file to a repository), then you can use the `HCLOUD_TOKEN` environment variable instead, which has precedence.
+
+If you set `masters_pool.instance_count` to 1 then the tool will create a non highly available control plane; for production clusters you may want to set it to a number greater than 1. This number must be odd to avoid split-brain issues with etcd, and the recommended number is 3.
+
+You can specify any number of worker node pools, static or autoscaled, and have mixed nodes with different specs for different workloads.
+
+Hetzner cloud init settings (`additional_packages` & `post_create_commands`) can be defined in the configuration file at root level as well as for each pool, if different settings are needed for different pools. If these settings are configured for a pool, they override the settings at root level.
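+
+For example, here is a minimal sketch of a run that keeps the token out of the config file, relying on the `HCLOUD_TOKEN` precedence described above (the token value is a placeholder):
+
+```bash
+# HCLOUD_TOKEN takes precedence over hetzner_token in the config file,
+# so the token never has to be committed to a repository.
+export HCLOUD_TOKEN="<your Hetzner Cloud API token>"
+hetzner-k3s create --config cluster_config.yaml
+```
+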
At the moment Hetzner Cloud has five locations: two in Germany (`nbg1`, Nuremberg and `fsn1`, Falkenstein), one in Finland (`hel1`, Helsinki) and two in the USA (`ash`, Ashburn, Virginia, and `hil`, Hillsboro, Oregon). Please keep in mind that US locations only offer instances with AMD CPUs at the moment, while the newly introduced ARM instances are only available in Falkenstein-fsn1 for now.
+
+For the available instance types and their specs, either check from inside a project when adding an instance manually or run the following with your Hetzner token:
+
+```bash
+curl -H "Authorization: Bearer $API_TOKEN" 'https://api.hetzner.cloud/v1/server_types'
+```
+
+To create the cluster run:
+
+```bash
+hetzner-k3s create --config cluster_config.yaml | tee create.log
+```
+
+This will take a few minutes depending on the number of masters and worker nodes.
+
+### Disabling public IPs (IPv4 or IPv6 or both) on nodes
+
+With `ipv4: false` and `ipv6: false` under `networking`.`public_network` you can disable the public interface for all nodes, for improved security and to save on the cost of IPv4 addresses. These settings are global and affect all master and worker nodes. If you disable public IPs, be sure to run hetzner-k3s from a machine that has access to the same private network as the nodes, either directly or via some VPN.
+Additional networking setup is required via cloud init, so it's important that the machine from which you run hetzner-k3s have internet access and DNS configured correctly, otherwise the cluster creation process will get stuck after creating the nodes. See [this discussion](https://github.com/vitobotta/hetzner-k3s/discussions/252) for additional information and instructions.
+
+### Using alternative OS images
+
+By default, the image in use is `ubuntu-24.04` for all the nodes, but you can specify a different default image with the root level `image` config option, or even different images for different static node pools by setting the `image` config option in each node pool. This way you can, for example, have some node pools with ARM instances use the correct OS image for ARM: to do this and use, say, Ubuntu 24.04 on ARM instances, set `image` to the specific image ID, e.g. `103908130`. With regard to autoscaling, due to a limitation in the Cluster Autoscaler for Hetzner it is not yet possible to specify a different image for each autoscaled pool, so for now you can specify the image for all autoscaled pools with the `autoscaling_image` setting, if you want to use an image different from the one specified in `image`.
+
+To see the list of available images, run the following:
+
+```bash
+export API_TOKEN=...
+
+curl -H "Authorization: Bearer $API_TOKEN" 'https://api.hetzner.cloud/v1/images?per_page=100'
+```
+
+Besides the default OS images, it's also possible to use a snapshot that you have already created from an existing instance. Note that with custom snapshots you'll need to specify the **ID** of the snapshot/image, not the description you gave when you created the template instance.
+
+I've tested snapshots for [openSUSE MicroOS](https://microos.opensuse.org/) but others might work too. You can easily create a snapshot for MicroOS using [this tool](https://github.com/kube-hetzner/packer-hcloud-microos). Creating the snapshot takes just a couple of minutes and then you can use it with hetzner-k3s by setting the config option `image` to the **ID** of the snapshot, and `snapshot_os` to `microos`.
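+
+Since snapshots must be referenced by ID, one quick way to find the right ID is to filter the images endpoint by type. This is only a sketch, assuming the `type=snapshot` filter of the Hetzner Cloud images API and that `jq` is installed:
+
+```bash
+# List only snapshots, with just their IDs and descriptions,
+# to find the ID to put in the `image` config option
+curl -s -H "Authorization: Bearer $API_TOKEN" \
+  'https://api.hetzner.cloud/v1/images?type=snapshot&per_page=100' \
+  | jq '.images[] | {id, description}'
+```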
+ + +### Keeping a project per cluster + +If you want to create multiple clusters per project, see [Configuring Cluster-CIDR and Service-CIDR](#configuring-cluster-cidr-and-service-cidr). Make sure, that every cluster has its own dedicated Cluster- and Service-CIDR. If they overlap, it will cause problems. But I still recommend keeping clusters separated from each other. This way, if you want to delete a cluster with all the resources created for it, you can just delete the project. + +### Configuring Cluster-CIDR and Service-CIDR + +Cluster-CIDR and Service-CIDR describe the IP-Ranges that are used for pods and services respectively. Under normal circumstances you should not need to change these values. However, advanced scenarios may require you to change them to avoid networking conflicts. + +**Changing the Cluster-CIDR (Pod IP-Range):** + +To change the Cluster-CIDR, uncomment/add the `cluster_cidr` option in your cluster configuration file and provide a valid CIDR notated network to use. The provided network must not be a subnet of your private network. + +**Changing the Service-CIDR (Service IP-Range):** + +To change the Service-CIDR, uncomment/add the `service_cidr` option in your cluster configuration file and provide a valid CIDR notated network to use. The provided network must not be a subnet of your private network. + +Also uncomment the `cluster_dns` option and provide a single IP-Address from your `service_cidr` range. `cluster_dns` sets the IP-Address of the coredns service. + +**Sizing the Networks** + +The networks you provide should provide enough space for the expected amount of pods/services. By default `/16` networks are used. Please make sure you chose an adequate size, as changing the CIDR afterwards is not supported. + +### Idempotency + +The `create` command can be run any number of times with the same configuration without causing any issue, since the process is idempotent. This means that if for some reason the create process gets stuck or throws errors (for example if the Hetzner API is unavailable or there are timeouts etc), you can just stop the current command, and re-run it with the same configuration to continue from where it left. + +Note that the kubeconfig will be overwritten when you re-run the `create` command. + + +### Limitations: + +- if possible, please use modern SSH keys since some operating systems have deprecated old crypto based on SHA1; therefore I recommend you use ECDSA keys instead of the old RSA type +- if you use a snapshot instead of one of the default images, the creation of the instances will take longer than when using a regular image +- the setting `networking`.`allowed_networks`.`api` allows specifying which networks can access the Kubernetes API, but this only works with single master clusters currently. Multi-master HA clusters require a load balancer for the API, but load balancers are not yet covered by Hetzner's firewalls +- if you enable autoscaling for one or more nodepools, do not change that setting afterwards as it can cause problems to the autoscaler +- autoscaling is only supported when using Ubuntu or one of the other default images, not snapshots +- worker nodes created by the autoscaler must be deleted manually from the Hetzner Console when deleting the cluster (this will be addressed in a future update) +- SSH keys with passphrases can only be used if you set `networking`.`ssh`.`use_ssh_agent` to `true` and use an SSH agent to access your key. To start and agent e.g. 
on macOS: + +```bash +eval "$(ssh-agent -s)" +ssh-add --apple-use-keychain ~/.ssh/ +``` + diff --git a/docs/Deleting_a_cluster.md b/docs/Deleting_a_cluster.md new file mode 100644 index 00000000..876e6772 --- /dev/null +++ b/docs/Deleting_a_cluster.md @@ -0,0 +1,12 @@ +# Deleting a cluster + +To delete a cluster, running + +```bash +hetzner-k3s delete --config cluster_config.yaml +``` + +This will delete all the resources in the Hetzner Cloud project created by `hetzner-k3s` directly. + +**NOTE:** at the moment instances created by the cluster autoscaler, as well as load balancers and persistent volumes created by deploying your applications must be deleted manually. This may be addressed in a future release. + diff --git a/docs/Installation.md b/docs/Installation.md new file mode 100644 index 00000000..fbd8654e --- /dev/null +++ b/docs/Installation.md @@ -0,0 +1,71 @@ +## Prerequisites + +All that is needed to use this tool is + +- an Hetzner Cloud account + +- an Hetzner Cloud token: for this you need to create a project from the cloud console, and then an API token with **both read and write permissions** (sidebar > Security > API Tokens); you will see the token only once, so be sure to take note of it somewhere safe + +- kubectl and Helm installed + +___ +# Installation + +Before using the tool, be sure to have kubectl installed as it's required to install some components in the cluster and perform k3s upgrades. + +### macOS + +#### With Homebrew + +```bash +brew install vitobotta/tap/hetzner_k3s +``` + +#### Binary installation + +You need to install these dependencies first: +- libssh2 +- libevent +- bdw-gc +- libyaml +- pcre +- gmp + +##### Intel / x86 + +```bash +wget https://github.com/vitobotta/hetzner-k3s/releases/download/v2.0.8/hetzner-k3s-macos-amd64 +chmod +x hetzner-k3s-macos-amd64 +sudo mv hetzner-k3s-macos-amd64 /usr/local/bin/hetzner-k3s +``` + +##### Apple Silicon / ARM + +```bash +wget https://github.com/vitobotta/hetzner-k3s/releases/download/v2.0.8/hetzner-k3s-macos-arm64 +chmod +x hetzner-k3s-macos-arm64 +sudo mv hetzner-k3s-macos-arm64 /usr/local/bin/hetzner-k3s +``` + +### Linux + +#### amd64 + +```bash +wget https://github.com/vitobotta/hetzner-k3s/releases/download/v2.0.8/hetzner-k3s-linux-amd64 +chmod +x hetzner-k3s-linux-amd64 +sudo mv hetzner-k3s-linux-amd64 /usr/local/bin/hetzner-k3s +``` + +#### arm + +```bash +wget https://github.com/vitobotta/hetzner-k3s/releases/download/v2.0.8/hetzner-k3s-linux-arm64 +chmod +x hetzner-k3s-linux-arm64 +sudo mv hetzner-k3s-linux-arm64 /usr/local/bin/hetzner-k3s +``` + +### Windows + +I recommend using the Linux binary under [WSL](https://learn.microsoft.com/en-us/windows/wsl/install). + diff --git a/docs/Load_balancers.md b/docs/Load_balancers.md new file mode 100644 index 00000000..5771d374 --- /dev/null +++ b/docs/Load_balancers.md @@ -0,0 +1,25 @@ +# Load balancers + +Once the cluster is ready, you can already provision services of type LoadBalancer for your workloads (such as the Nginx ingress controller for example) thanks to the Hetzner Cloud Controller Manager that is installed automatically. + +There are some annotations that you can add to your services to configure the load balancers. 
At a minimum your need these two: + +```yaml +load-balancer.hetzner.cloud/location: nbg1 # must ensure the network location of the load balancer is same as for the nodes +load-balancer.hetzner.cloud/use-private-ip: "true" # ensures the traffic between LB and nodes goes through the private network, so you don't need to change anything in the firewall +``` + +The above are required, but I also recommend these: + +```yaml +load-balancer.hetzner.cloud/hostname: +load-balancer.hetzner.cloud/http-redirect-https: 'false' +load-balancer.hetzner.cloud/name: +load-balancer.hetzner.cloud/uses-proxyprotocol: 'true' +``` + +I set `load-balancer.hetzner.cloud/hostname` to a valid hostname that I configure (after creating the load balancer) with the IP of the load balancer; I use this together with the annotation `load-balancer.hetzner.cloud/uses-proxyprotocol: 'true'` to enable the proxy protocol. Reason: I enable the proxy protocol on the load balancers so that my ingress controller and applications can "see" the real IP address of the client. However when this is enabled, there is a problem where [cert-manager](https://cert-manager.io/docs/) fails http01 challenges; you can find an explanation of why [here](https://github.com/compumike/hairpin-proxy) but the easy fix provided by some providers - including Hetzner - is to configure the load balancer so that it uses a hostname instead of an IP. Again, read the explanation for the reason but if you care about seeing the actual IP of the client then I recommend you use these two annotations. + +The other annotations should be self explanatory. You can find a list of the available annotations [here](https://pkg.go.dev/github.com/hetznercloud/hcloud-cloud-controller-manager/internal/annotation). + +**Note**: in a future release it will be possible to configure ingress controllers with host ports, so it will be possible to use an ingress without having to buy a load balancer, but for the time being a load balancer is still required. diff --git a/docs/Maintenance.md b/docs/Maintenance.md new file mode 100644 index 00000000..8b45dc71 --- /dev/null +++ b/docs/Maintenance.md @@ -0,0 +1,102 @@ +# Maintenance + +## Adding nodes + +To add one or more nodes to a node pool, just change the instance count in the configuration file for that node pool and re-run the create command. + +**Important**: if you are increasing the size of a node pool created prior to v0.5.7, please see [this thread](https://github.com/vitobotta/hetzner-k3s/issues/80). + +## Scaling down a node pool + +To make a node pool smaller: + +- decrease the instance count for the node pool in the configuration file so that those extra nodes are not recreated in the future +- delete the nodes from Kubernetes (`kubectl delete node `) +- delete the instances from the cloud console if the Cloud Controller Manager doesn't delete it automatically (make sure you delete the correct ones 🤭) + +In a future release I will add some automation for the cleanup. + +## Replacing a problematic node + +- delete the node from Kubernetes (`kubectl delete node `) +- delete the correct instance from the cloud console +- re-run the `create` command. This will re-create the missing node and have it join to the cluster + +## Converting a non-HA cluster to HA + +It's easy to convert a non-HA with a single master cluster to HA with multiple masters. Just change the masters instance count and re-run the `create` command. 
This will create a load balancer for the API server and update the kubeconfig so that all the API requests go through the load balancer. + +## Replacing the seed master + +When creating a new cluster, the seed master (or first master) in a HA configuration is `master1`. The seed master will change if you delete `master1` due to some issues with the node so it gets recreated. Whenever the seed master changes, k3s must be restarted on the existing masters. + +___ +## Upgrading to a new version of k3s + +If it's the first time you upgrade the cluster, all you need to do to upgrade it to a newer version of k3s is run the following command: + +```bash +hetzner-k3s upgrade --config cluster_config.yaml --new-k3s-version v1.27.1-rc2+k3s1 +``` + +So you just need to specify the new k3s version as an additional parameter and the configuration file will be updated with the new version automatically during the upgrade. To see the list of available k3s releases run the command `hetzner-k3s releases`. + +Note: (single master clusters only) the API server will briefly be unavailable during the upgrade of the controlplane. + +To check the upgrade progress, run `watch kubectl get nodes -owide`. You will see the masters being upgraded one per time, followed by the worker nodes. + +NOTE: if you haven't used the tool in a while before upgrading, you may need to delete the file `cluster_config.yaml.example` in your temp folder to refresh the list of available k3s versions. + + +### What to do if the upgrade doesn't go smoothly + +If the upgrade gets stuck for some reason, or it doesn't upgrade all the nodes: + +1. Clean up the existing upgrade plans and jobs, and restart the upgrade controller + +```bash +kubectl -n system-upgrade delete job --all +kubectl -n system-upgrade delete plan --all + +kubectl label node --all plan.upgrade.cattle.io/k3s-server- plan.upgrade.cattle.io/k3s-agent- + +kubectl -n system-upgrade rollout restart deployment system-upgrade-controller +kubectl -n system-upgrade rollout status deployment system-upgrade-controller +``` + +You can also check the logs of the system upgrade controller's pod: + +```bash +kubectl -n system-upgrade \ + logs -f $(kubectl -n system-upgrade get pod -l pod-template-hash -o jsonpath="{.items[0].metadata.name}") +``` + +A final note about upgrades is that if for some reason the upgrade gets stuck after upgrading the masters and before upgrading the worker nodes, just cleaning up the resources as described above might not be enough. In that case also try running the following to tell the upgrade job for the workers that the masters have already been upgraded, so the upgrade can continue for the workers: + +```bash +kubectl label node plan.upgrade.cattle.io/k3s-server=upgraded +``` + +___ +## Upgrading the OS on nodes + +- consider adding a temporary node during the process if you don't have enough spare capacity in the cluster +- drain one node +- update etc +- reboot +- uncordon +- proceed with the next node + +If you want to automate this process I recommend you install the [Kubernetes Reboot Daemon ](https://kured.dev/) ("Kured"). For this to work properly, make sure the OS you choose for the nodes has unattended upgrades enabled at least for security updates. 
+
+If you want to automate this process I recommend you install the [Kubernetes Reboot Daemon](https://kured.dev/) ("Kured"). For this to work properly, make sure the OS you choose for the nodes has unattended upgrades enabled, at least for security updates. For example, if the image is Ubuntu, you can add this to the configuration file before running the `create` command:
+
+```yaml
+additional_packages:
+- unattended-upgrades
+- update-notifier-common
+post_create_commands:
+- sudo systemctl enable unattended-upgrades
+- sudo systemctl start unattended-upgrades
+```
+
+Check the Kured documentation for configuration options like the maintenance window etc.
+
diff --git a/docs/Recommendations.md b/docs/Recommendations.md
new file mode 100644
index 00000000..b317b323
--- /dev/null
+++ b/docs/Recommendations.md
@@ -0,0 +1,23 @@
+# Recommendations
+
+## Larger clusters
+
+The default configuration settings are pretty good for most small to medium clusters, so you can leave most settings unchanged if you want to go with a configuration that has been tested extensively.
+
+However, keep in mind that this default configuration - which uses a Hetzner private network with the default Flannel CNI built into k3s - may not be optimal for larger clusters:
+
+1. Private networks in Hetzner Cloud support a maximum of 100 nodes, so I recommend you disable the private network in the configuration if you expect your cluster to grow beyond 100 nodes.
+2. Flannel is fine for small to medium clusters, but performance starts to degrade with clusters of several hundred or thousands of nodes, so I recommend switching to Cilium as the CNI: its performance is excellent and it scales well to very large clusters.
+
+Notes:
+- if you disable the private network due to the limitation mentioned above, encryption will be enforced at the CNI level to secure the traffic between nodes over the public network.
+- if you want to use something other than Cilium or Flannel (e.g. Calico), you can disable the automatic setup of the CNI and install a CNI of your choice. We may add built-in support for more CNIs in future releases.
+- from v2.0.0 on you can also use an external SQL datastore like Postgres instead of the embedded etcd as the datastore for the Kubernetes API. This can also help with scaling larger clusters.
+
+## Registry mirror
+
+v2.0.0 introduces a setting to optionally enable the `embedded registry mirror` in k3s (see [this page](https://docs.k3s.io/installation/registry-mirror) for more information). This is basically an installation of [Spegel](https://github.com/spegel-org/spegel), which enables peer-to-peer distribution of container images between the nodes of a cluster. This can help avoid problems with nodes not being able to pull images because their IPs have been banned by a registry (due to malicious use of the same IPs in the past or similar reasons), because a node will try pulling an image from other nodes via the embedded registry mirror before pulling it from the upstream registry. This also speeds up pod creation, because less time is spent downloading images from the upstream registries when deployments have many replicas spread across many nodes.
+
+## Clusters using only the public network
+
+If you disable the private network to be able to create a cluster with more than 100 nodes, then you cannot restrict access to the Kubernetes API by IP address, because otherwise the API would not be accessible from the nodes. This limitation may be removed in a future release if a workaround is found, but for the time being the API must be accessible to 0.0.0.0/0 when the private network is disabled.
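+
+Putting the recommendations above together, here is a sketch of the relevant keys for a larger cluster. The key names follow the example configs elsewhere in this repo; the `cilium` mode value and the external datastore setup are assumptions to verify against the current documentation:
+
+```bash
+# Write a snippet to merge into your cluster configuration file (sketch only).
+cat > larger-cluster-snippet.yaml <<'EOF'
+networking:
+  private_network:
+    enabled: false   # Hetzner private networks support at most 100 nodes
+  cni:
+    enabled: true
+    encryption: true # node-to-node traffic goes over the public network
+    mode: cilium     # assumption: switches the CNI from flannel to cilium
+datastore:
+  mode: external     # etcd (default) or external (e.g. Postgres)
+EOF
+```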
diff --git a/docs/Setting up a cluster.md b/docs/Setting up a cluster.md
new file mode 100644
index 00000000..5d563847
--- /dev/null
+++ b/docs/Setting up a cluster.md
@@ -0,0 +1,311 @@
+By [TitanFighter](https://github.com/TitanFighter)
+
+
+## Instructions
+
+### Installation of a "hello-world" project
+
+For testing we are going to use this "hello-world" app - https://gist.githubusercontent.com/vitobotta/6e73f724c5b94355ec21b9eee6f626f1/raw/3036d4c4283a08ab82b99fffea8df3dded1d1f78/deployment.yaml
+
+1. Install `kubectl` on your computer: https://kubernetes.io/docs/tasks/tools/#kubectl
+2. Install `Helm` on your computer: https://helm.sh/docs/intro/install/
+3. Install `hetzner-k3s` on your computer: [Installation](Installation.md)
+4. Create the file `hetzner-k3s_cluster_config.yaml` with the config below (this is a config for a highly available (HA) cluster with 3 master nodes plus two worker node pools; you can use 1 master + 1 worker for testing):
+
+```yaml
+hetzner_token: ...
+cluster_name: hello-world
+kubeconfig_path: "./kubeconfig" # or /cluster/kubeconfig if you are going to use Docker
+k3s_version: v1.23.3+k3s1
+
+networking:
+  ssh:
+    port: 22
+    use_agent: false
+    public_key_path: "~/.ssh/id_rsa.pub"
+    private_key_path: "~/.ssh/id_rsa"
+  allowed_networks:
+    ssh:
+      - 0.0.0.0/0
+    api:
+      - 0.0.0.0/0
+
+masters_pool:
+  instance_type: cpx21
+  instance_count: 3
+  location: nbg1
+
+worker_node_pools:
+- name: small
+  instance_type: cpx21
+  instance_count: 4
+  location: hel1
+- name: big
+  instance_type: cpx31
+  instance_count: 2
+  location: fsn1
+  autoscaling:
+    enabled: true
+    min_instances: 0
+    max_instances: 3
+```
+
+Refer to the full config example in [Creating a cluster](Creating_a_cluster.md) for details on all the settings that can be customized.
+
+5. Create the cluster: `hetzner-k3s create --config hetzner-k3s_cluster_config.yaml`
+6. `hetzner-k3s` automatically creates a `kubeconfig` file for the cluster in the directory where you run the tool, so you can either copy the `kubeconfig` file to `~/.kube/config` if it's the only cluster, or run `export KUBECONFIG=./kubeconfig` in the same directory to access the cluster. Then you can interact with your cluster via the `kubectl` installed in step 1.
+
+TIP: If you don't want to run `kubectl apply ...` every time, you can store all configs in some folder and then run `kubectl apply -f /path/to/configs/ -R`.
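+
+For example (assuming you are in the directory containing the generated kubeconfig):
+
+```bash
+export KUBECONFIG=./kubeconfig
+# All masters and workers should be listed and eventually report Ready status.
+kubectl get nodes -o wide
+```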
+
+7. Create the file: `touch ingress-nginx-annotations.yaml`
+8. Add annotations to the file: `nano ingress-nginx-annotations.yaml`
+
+```yaml
+# INSTALLATION
+# 1. Install Helm: https://helm.sh/docs/intro/install/
+# 2. Add the ingress-nginx Helm repo: helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
+# 3. Update information of available charts locally from chart repositories: helm repo update
+# 4. Install ingress-nginx:
+# helm upgrade --install \
+# ingress-nginx ingress-nginx/ingress-nginx \
+# --set controller.ingressClassResource.default=true \ # remove this line if you don't want Nginx to become the default Ingress Controller
+# -f ./ingress-nginx-annotations.yaml \
+# --namespace ingress-nginx \
+# --create-namespace
+
+# LIST of all ANNOTATIONS: https://github.com/hetznercloud/hcloud-cloud-controller-manager/blob/master/internal/annotation/load_balancer.go
+
+controller:
+  kind: DaemonSet
+  service:
+    annotations:
+      # Germany:
+      # - nbg1 (Nuremberg)
+      # - fsn1 (Falkenstein)
+      # Finland:
+      # - hel1 (Helsinki)
+      # USA:
+      # - ash (Ashburn, Virginia)
+      # Without this annotation the load balancer won't be provisioned and will stay in the "pending" state.
+      # You can check that state via "kubectl get svc -n ingress-nginx".
+      load-balancer.hetzner.cloud/location: nbg1
+
+      # Name of the load balancer. You will see this name in the Hetzner cloud console under "Your project -> Load Balancers".
+      # NOTE: This is NOT the load balancer that the tool creates automatically for clusters with multiple masters (HA configuration).
+      # You need to specify a different name here so a separate load balancer is created for the Nginx ingress controller.
+      load-balancer.hetzner.cloud/name: WORKERS_LOAD_BALANCER_NAME
+
+      # Ensures that the communication between the load balancer and the cluster nodes happens through the private network
+      load-balancer.hetzner.cloud/use-private-ip: "true"
+
+      # [ START: If you care about seeing the actual IP of the client then use these two annotations ]
+      # - "uses-proxyprotocol" enables the proxy protocol on the load balancers so that the ingress controller and
+      #   applications can "see" the real IP address of the client.
+      # - "hostname" is only needed if you use cert-manager (LetsEncrypt SSL certificates). You need it in order
+      #   to fix failing http01 challenges of cert-manager (https://cert-manager.io/docs/).
+      #   Here (https://github.com/compumike/hairpin-proxy) you can find a description of this problem.
+      #   In short: the easiest fix provided by some providers (including Hetzner) is to configure the load balancer so
+      #   that it uses a hostname instead of an IP.
+      load-balancer.hetzner.cloud/uses-proxyprotocol: 'true'
+
+      # 1. "yourDomain.com" must be configured in the DNS correctly to point to the Nginx load balancer,
+      #    otherwise the provisioning of certificates won't work;
+      # 2. if you use several domains, specify any one of them.
+      load-balancer.hetzner.cloud/hostname: yourDomain.com
+      # [ END: If you care about seeing the actual IP of the client then use these two annotations ]
+
+      load-balancer.hetzner.cloud/http-redirect-https: 'false'
+```
+
+9. Add the ingress-nginx Helm repo: `helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx`
+10. Update information of available charts locally from chart repositories: `helm repo update`
+11. Install ingress-nginx:
+
+```bash
+helm upgrade --install \
+ingress-nginx ingress-nginx/ingress-nginx \
+--set controller.ingressClassResource.default=true \
+-f ./ingress-nginx-annotations.yaml \
+--namespace ingress-nginx \
+--create-namespace
+```
+
+`--set controller.ingressClassResource.default=true` configures this as the default Ingress Class for your cluster. Without a default, you must specify an Ingress Class for every Ingress object you deploy, which is often difficult when deploying Helm charts. If you neither set a default Ingress Class nor specify one on the Ingress resource, Nginx will not "pick up" the Ingress resource and will serve a 404 Not Found page.
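+
+For illustration, here is a minimal Ingress that relies on the Ingress Class (a sketch; the `hello-world` Service name and port are assumptions based on the demo app used in this guide):
+
+```bash
+kubectl apply -f - <<'EOF'
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: example
+spec:
+  ingressClassName: nginx   # can be omitted if Nginx is the default Ingress Class
+  rules:
+  - host: example.com
+    http:
+      paths:
+      - path: /
+        pathType: Prefix
+        backend:
+          service:
+            name: hello-world   # placeholder Service
+            port:
+              number: 80
+EOF
+```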
+
+TIP: Just in case you need to delete it: `helm uninstall ingress-nginx -n ingress-nginx`.
+Be careful: this will delete the current Hetzner load balancer, and as a result, when you install a new ingress controller,
+a new Hetzner load balancer will likely be created with a new public IP address.
+
+12. In a few minutes, check that the "EXTERNAL-IP" column shows an IP instead of "pending": `kubectl get svc -n ingress-nginx`
+
+13. The `load-balancer.hetzner.cloud/uses-proxyprotocol: "true"` annotation requires `use-proxy-protocol: "true"` on the ingress-nginx side, so let's create a file: `touch ingress-nginx-configmap.yaml`
+14. Add this content to the file you just created: `nano ingress-nginx-configmap.yaml`
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  # Do not change the name - this is the name required by the Nginx Ingress Controller
+  name: ingress-nginx-controller
+  namespace: ingress-nginx
+data:
+  use-proxy-protocol: "true"
+```
+
+15. Apply the config map: `kubectl apply -f ./ingress-nginx-configmap.yaml`
+16. Open the Hetzner cloud console, go to "Your project -> Load Balancers" and find the public IP next to the name you used with the `load-balancer.hetzner.cloud/name: WORKERS_LOAD_BALANCER_NAME` annotation. Copy this IP.
+17. Download the hello-world app: `curl https://gist.githubusercontent.com/vitobotta/6e73f724c5b94355ec21b9eee6f626f1/raw/3036d4c4283a08ab82b99fffea8df3dded1d1f78/deployment.yaml --output hello-world.yaml`
+18. Edit the file (add the annotation and the Hetzner load balancer IP address) and set the hostname:
+
+```yaml
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: hello-world
+  annotations:                             # <<<--- Add annotation
+    kubernetes.io/ingress.class: nginx     # <<<--- Add annotation
+spec:
+  rules:
+  - host: hello-world.IP_FROM_STEP_16.nip.io # <<<--- Add the IP from step 16
+  ....
+```
+
+19. Install the hello-world app: `kubectl apply -f hello-world.yaml`
+20. Check http://hello-world.IP_FROM_STEP_16.nip.io
+You should see the RANCHER Hello world! page.
+"host.IP_FROM_STEP_16.nip.io" (the key part is ".nip.io") is just a quick way to test things without configuring DNS (a query to a hostname ending in nip.io simply returns the IP address it finds in the hostname itself). Also, if you enabled [proxy-protocol](https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt) as shown above, you should find your own current public IP address in the `X-Forwarded-For` header - i.e. the application can "see" it.
+21. In order to connect yourDomain.com, you need to:
+    - assign the IP address from step 16 to your domain in the DNS panel of your domain registrar;
+    - change `- host: hello-world.IP_FROM_STEP_16.nip.io` to `- host: yourDomain.com`;
+    - `kubectl apply -f hello-world.yaml`;
+    - wait until the DNS records are updated.
+
+#### If you need LetsEncrypt
+
+22. Add the jetstack (cert-manager) Helm repo: `helm repo add jetstack https://charts.jetstack.io`
+23. Update information of available charts locally from chart repositories: `helm repo update`
+24. Install the LetsEncrypt certificates issuer:
+
+```bash
+helm upgrade --install \
+--namespace cert-manager \
+--create-namespace \
+--set crds.enabled=true \
+cert-manager jetstack/cert-manager
+```
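+
+Before creating the issuer in the next step, you can quickly verify that cert-manager came up:
+
+```bash
+# Expect the cert-manager, cert-manager-cainjector and cert-manager-webhook
+# pods to reach the Running state.
+kubectl get pods -n cert-manager
+```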
+25. Create the file `lets-encrypt.yaml` with this content:
+
+```yaml
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-prod
+  namespace: cert-manager
+spec:
+  acme:
+    email: YOUR@EMAIL.com
+    server: https://acme-v02.api.letsencrypt.org/directory
+    privateKeySecretRef:
+      name: letsencrypt-prod-account-key
+    solvers:
+    - http01:
+        ingress:
+          class: nginx
+```
+
+26. Apply the file: `kubectl apply -f ./lets-encrypt.yaml`
+27. Edit `hello-world.yaml` and add the settings for TLS encryption:
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: hello-world
+  annotations:
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"    # <<<--- Add annotation
+    kubernetes.io/tls-acme: "true"                        # <<<--- Add annotation
+spec:
+  rules:
+  - host: yourDomain.com  # <<<---- Your real domain
+  tls: # <<<---- Add this block
+  - hosts:
+    - yourDomain.com
+    secretName: yourDomain.com-tls # <<<--- Add a reference to the secret
+
+  ....
+```
+
+TIP: if you chose not to configure Nginx as the default Ingress Class, you must also add the `kubernetes.io/ingress.class: nginx` annotation.
+
+28. Apply the changes: `kubectl apply -f ./hello-world.yaml`
+
+
+
+## FAQs
+
+#### 1. Is it possible to use, for example, MetalLB instead of Hetzner's LB?
+
+There is a way to use MetalLB with floating IPs in Hetzner Cloud, but I don't recommend it. The setup with standard load balancers is much simpler, and load balancers are not that much more expensive than floating IPs, so IMO there's no point in using MetalLB.
+
+#### 2. How to create and push Docker images to a registry, and how to allow Kubernetes to work with those images (GitLab example)?
+
+On the computer that builds the image:
+- `docker login registry.gitlab.com`
+- `docker build -t registry.gitlab.com/COMPANY_NAME/REPO_NAME:IMAGE_NAME -f /some/path/to/Dockerfile .`
+- `docker push registry.gitlab.com/COMPANY_NAME/REPO_NAME:IMAGE_NAME`
+
+On the computer that runs kubectl:
+- generate a secret to access the images: `kubectl create secret docker-registry gitlabcreds --docker-server=https://registry.gitlab.com --docker-username=MYUSER --docker-password=MYPWD --docker-email=MYEMAIL -n NAMESPACE_OF_YOUR_APP -o yaml > docker-secret.yaml`
+- apply the secret: `kubectl apply -f docker-secret.yaml -n NAMESPACE_OF_YOUR_APP`
+
+#### 3. How to check how many resources nodes/pods use?
+- Install metrics-server: https://github.com/kubernetes-sigs/metrics-server
+- Then use either `kubectl top nodes` or `kubectl top pods -A`
+
+#### 4. What is Ingress?
+There are 2 types of "ingress" -> `Ingress Controller` and `Ingress Resources`.
+To simplify everything, in the case of Nginx...
+
+- The Ingress Controller is Nginx itself; it runs in the cluster as a Pod plus a Service. Ingress Resources (`kind: Ingress`) are the routing rules that tell Nginx which hostnames and paths map to which Services.
+- The Ingress Controller supports many annotations and options. Options set in the controller's ConfigMap apply "globally", while annotations set on an individual Ingress resource apply "locally", to that resource only.
+- The Ingress Controller consists of a Pod and a Service. The Pod runs the controller, which constantly polls the /ingresses endpoint on the API server of your cluster for updates to the available Ingress Resources.
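+
+The distinction in practice (assuming the ingress-nginx setup from this guide):
+
+```bash
+kubectl get pods -n ingress-nginx   # the controller itself (Nginx pods)
+kubectl get ingressclass            # controllers registered in the cluster
+kubectl get ingress -A              # your routing rules (kind: Ingress)
+```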
+
+#### 5. How to make autoscaling automatically configure IP routes to use a NAT server for new nodes?
+
+- You need to have a NAT server - as explained in this [Hetzner community tutorial](https://community.hetzner.com/tutorials/how-to-set-up-nat-for-cloud-networks#step-2---adding-the-route-to-the-network).
+- Use `post_create_commands` (multi-line commands don't seem to be supported at the moment):
+
+```yaml
+additional_packages:
+- ifupdown
+post_create_commands:
+- apt update
+- apt upgrade -y
+- apt autoremove -y
+- ip route add default via 10.0.0.1 # Adapt this to your gateway IP
+```
+
+## Useful commands
+
+```bash
+kubectl get service [serviceName] -A or -n [nameSpace]
+kubectl get ingress [ingressName] -A or -n [nameSpace]
+kubectl get pod [podName] -A or -n [nameSpace]
+kubectl get all -A
+kubectl get events -A
+helm ls -A
+helm uninstall [name] -n [nameSpace]
+kubectl -n ingress-nginx get svc
+kubectl describe ingress -A
+kubectl describe svc -n ingress-nginx
+kubectl delete configmap nginx-config -n ingress-nginx
+kubectl rollout restart deployment -n NAMESPACE_OF_YOUR_APP
+# Note: "kubectl get all -A" does not include Ingresses, so use: kubectl get ing -A
+```
+
+## Useful links
+Cheat Sheet - https://kubernetes.io/docs/reference/kubectl/cheatsheet/
+A visual guide on troubleshooting Kubernetes deployments - https://learnk8s.io/troubleshooting-deployments
diff --git a/docs/Storage.md b/docs/Storage.md
new file mode 100644
index 00000000..2e46ad53
--- /dev/null
+++ b/docs/Storage.md
@@ -0,0 +1,5 @@
+# Storage
+
+Once the cluster is ready, you can create persistent volumes out of the box with the default storage class `hcloud-volumes`, since the Hetzner CSI driver is installed automatically. This will use Hetzner's block storage (based on Ceph, so it's replicated and highly available) for your persistent volumes. Note that the minimum size of a volume is 10Gi. If you specify a smaller size for a volume, the volume will be created with a capacity of 10Gi anyway.
+
+For workloads like databases that benefit from maximum IOPS, there's also the `local-path` storage class. See [this page](https://docs.k3s.io/storage) for more details.
diff --git a/docs/Troubleshooting.md b/docs/Troubleshooting.md
new file mode 100644
index 00000000..e97647a7
--- /dev/null
+++ b/docs/Troubleshooting.md
@@ -0,0 +1,3 @@
+# Troubleshooting
+
+If the tool hangs forever after creating the instances and you see timeouts, this may be caused by problems with your SSH key, for example if you use a key with a passphrase or an older key (due to the deprecation of some older crypto algorithms in newer operating systems). In this case you may want to try setting `use_ssh_agent` to `true` to use the SSH agent. If you are not familiar with what an SSH agent is, take a look at [this page](https://smallstep.com/blog/ssh-agent-explained/) for an explanation.
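+
+For example, to load a passphrase-protected key into the agent before running the tool (the key path is an example):
+
+```bash
+# Start an agent for the current shell and add the key; you will be prompted
+# for the passphrase once, and the tool can then authenticate via the agent.
+eval "$(ssh-agent -s)"
+ssh-add ~/.ssh/id_rsa
+```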
diff --git a/e2e-tests/README.md b/e2e-tests/README.md
new file mode 100644
index 00000000..25319658
--- /dev/null
+++ b/e2e-tests/README.md
@@ -0,0 +1,72 @@
+# End-to-end test harness
+
+This directory contains a few scripts to automate testing of hetzner-k3s across different combinations of configurations.
+
+## How to use this?
+
+Copy `env.sample` to `env` and edit it to indicate your Hetzner API token. You can also change the instance location if you'd like, as well as the location of the hetzner-k3s binary that you would like to use for testing (convenient if you're building multiple versions and want to check for regressions).
+
+Then, to run a single test:
+
+```bash
+./run-single-test.sh config-sshport-image.yaml IMAGE=ubuntu-22.04 SSHPORT=222
+```
+
+The first argument is a configuration file template; the rest of the command line is an optional list of variables to substitute in the template.
+
+To run all the tests:
+
+```bash
+./run-all-tests.sh
+```
+
+To view the test results:
+
+```
+./list-test-results.sh
+```
+
+The output will look like this:
+```
+$ ./list-test-results.sh
+config-sshport-image.yaml IMAGE=alma-8 SSHPORT=222 error creating test-c59dc574
+config-sshport-image.yaml IMAGE=alma-8 SSHPORT=22 error creating test-15c94339
+config-sshport-image.yaml IMAGE=alma-9 SSHPORT=222 ok tested ok test-e9acedda
+config-sshport-image.yaml IMAGE=alma-9 SSHPORT=22 ok tested ok test-3a378dbe
+config-sshport-image.yaml IMAGE=centos-stream-8 SSHPORT=222 error done test-9063a269
+config-sshport-image.yaml IMAGE=centos-stream-8 SSHPORT=22 error done test-0a523221
+config-sshport-image.yaml IMAGE=centos-stream-9 SSHPORT=222 ok done test-857926a8
+config-sshport-image.yaml IMAGE=debian-11 SSHPORT=222 ok done test-fe655f1c
+config-sshport-image.yaml IMAGE=debian-11 SSHPORT=22 ok tested ok test-77bf45fe
+...
+config-sshport-image.yaml IMAGE=ubuntu-24.04 SSHPORT=222 ok tested ok test-b7c132d6
+```
+
+## Re-running a test
+
+The tests use a caching system: if you run the same test (same configuration file and same parameters) twice, it will be skipped the second time. This is so that you can add a test to the `run-all-tests.sh` script and re-run it to execute only the new tests that you added.
+
+If you want to re-run a test, delete the corresponding directory: it's the `test-xxxxxxxx` directory shown by `list-test-results.sh`.
+
+## What does it test, exactly?
+
+It executes `hetzner-k3s create`, then executes a few very basic `kubectl` commands, then executes `hetzner-k3s delete`.
+
+Each test is executed in a separate directory (the `test-xxxxxxxx` directory shown by `list-test-results.sh`), and the output of each phase is put in a log file in that directory. Status files are also created to track test success or failure.
+
+## That seems very primitive.
+
+It is! The goal was to test whether the SSH port option worked correctly across all distros. I thought this could be useful to test other options and combinations of options later.
+
+## It takes a very long time!
+
+Yes, because the tests are executed sequentially, not in parallel. This is because the default Hetzner quotas are fairly low (10 instances, I believe) and executing more than a couple of tests simultaneously (or in an account that already has a couple of instances running) would exceed the quota and cause the tests to fail.
+
+It would be fairly easy to parallelize the tests if the need arises, but we should then keep in mind that most folks will have this conservative instance quota, which would cause tests to fail.
+
+## How much will this cost to run?
+
+The instances will only run for a couple of minutes each time. I ran a bunch of tests with a bunch of different configurations and it probably cost me 1-2 EUR; but of course the size of the instances will influence this, and if you interrupt the test while instances are running (or if it crashes badly enough during the test) some instances might still be running, and you will need to clean them up manually!
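+
+One way to spot and remove such leftovers (assuming the official `hcloud` CLI is installed and `HCLOUD_TOKEN` is exported; the harness names test clusters `test-xxxxxxxx`):
+
+```bash
+hcloud server list | grep test-   # find leftover test instances
+hcloud server delete <server-name-or-id>
+```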
+
+
+
diff --git a/e2e-tests/config-simple.yaml b/e2e-tests/config-simple.yaml
new file mode 100644
index 00000000..be018b6b
--- /dev/null
+++ b/e2e-tests/config-simple.yaml
@@ -0,0 +1,47 @@
+---
+cluster_name: $NAME
+kubeconfig_path: $KUBECONFIG
+k3s_version: v1.30.2+k3s2
+
+networking:
+  ssh:
+    port: 22
+    use_agent: false # set to true if your key has a passphrase
+    public_key_path: sshkey.pub
+    private_key_path: sshkey
+  allowed_networks:
+    ssh:
+      - 0.0.0.0/0
+    api: # this will firewall port 6443 on the nodes; it will NOT firewall the API load balancer
+      - 0.0.0.0/0
+  public_network:
+    ipv4: true
+    ipv6: true
+  private_network:
+    enabled: true
+    subnet: 10.0.0.0/16
+    existing_network_name: ""
+  cni:
+    enabled: true
+    encryption: false
+    mode: flannel
+
+datastore:
+  mode: etcd # etcd (default) or external
+
+schedule_workloads_on_masters: false
+
+masters_pool:
+  instance_type: cpx11
+  instance_count: 3
+  location: $LOCATION
+
+worker_node_pools:
+- name: pool1
+  instance_type: cpx11
+  instance_count: 1
+  location: $LOCATION
+
+embedded_registry_mirror:
+  enabled: true
+
diff --git a/e2e-tests/config-sshport-image.yaml b/e2e-tests/config-sshport-image.yaml
new file mode 100644
index 00000000..8225337d
--- /dev/null
+++ b/e2e-tests/config-sshport-image.yaml
@@ -0,0 +1,49 @@
+---
+cluster_name: $NAME
+kubeconfig_path: $KUBECONFIG
+k3s_version: v1.30.2+k3s2
+
+image: $IMAGE
+
+networking:
+  ssh:
+    port: $SSHPORT
+    use_agent: false # set to true if your key has a passphrase
+    public_key_path: sshkey.pub
+    private_key_path: sshkey
+  allowed_networks:
+    ssh:
+      - 0.0.0.0/0
+    api: # this will firewall port 6443 on the nodes; it will NOT firewall the API load balancer
+      - 0.0.0.0/0
+  public_network:
+    ipv4: true
+    ipv6: true
+  private_network:
+    enabled: true
+    subnet: 10.0.0.0/16
+    existing_network_name: ""
+  cni:
+    enabled: true
+    encryption: false
+    mode: flannel
+
+datastore:
+  mode: etcd # etcd (default) or external
+
+schedule_workloads_on_masters: false
+
+masters_pool:
+  instance_type: cpx11
+  instance_count: 3
+  location: $LOCATION
+
+worker_node_pools:
+- name: pool1
+  instance_type: cpx11
+  instance_count: 1
+  location: $LOCATION
+
+embedded_registry_mirror:
+  enabled: true
+
diff --git a/e2e-tests/env.sample b/e2e-tests/env.sample
new file mode 100644
index 00000000..7783d563
--- /dev/null
+++ b/e2e-tests/env.sample
@@ -0,0 +1,3 @@
+export HCLOUD_TOKEN=...
+export LOCATION=ash
+export HK3S=../hetzner-k3s
diff --git a/e2e-tests/list-test-results.sh b/e2e-tests/list-test-results.sh
new file mode 100755
index 00000000..5a681c77
--- /dev/null
+++ b/e2e-tests/list-test-results.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+for T in test-*; do
+  printf "%s\t%s\t%s\t%s\t%s\n" \
+    "$(cat $T/config)" \
+    "$(cat $T/args)" \
+    "$(cat $T/result)" \
+    "$(cat $T/status)" \
+    "$T"
+done | sort | column -t -s "$(printf '\t')"
diff --git a/e2e-tests/run-all-tests.sh b/e2e-tests/run-all-tests.sh
new file mode 100755
index 00000000..295328ce
--- /dev/null
+++ b/e2e-tests/run-all-tests.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+T=./run-single-test.sh
+
+$T config-sshport-image.yaml IMAGE=debian-11 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=debian-11 SSHPORT=22
+$T config-sshport-image.yaml IMAGE=debian-12 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=ubuntu-20.04 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=ubuntu-22.04 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=ubuntu-24.04 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=alma-8 SSHPORT=22
+$T config-sshport-image.yaml IMAGE=alma-8 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=alma-9 SSHPORT=22
+$T config-sshport-image.yaml IMAGE=alma-9 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=rocky-8 SSHPORT=22
+$T config-sshport-image.yaml IMAGE=rocky-8 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=rocky-9 SSHPORT=22
+$T config-sshport-image.yaml IMAGE=fedora-38 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=fedora-39 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=fedora-40 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=centos-stream-8 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=centos-stream-9 SSHPORT=222
+$T config-sshport-image.yaml IMAGE=centos-stream-8 SSHPORT=22
diff --git a/e2e-tests/run-single-test.sh b/e2e-tests/run-single-test.sh
new file mode 100755
index 00000000..f30640c8
--- /dev/null
+++ b/e2e-tests/run-single-test.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+set -eu
+set -o pipefail
+. ./env # this env file should set HCLOUD_TOKEN, and perhaps LOCATION and HK3S
+HK3S=${HK3S-hetzner-k3s}
+CONFIG=$1
+shift
+# Hash the template and its parameters to derive a stable, unique test name.
+HASH=$({
+  echo "---"
+  echo "$@"
+  echo "---"
+  cat "$CONFIG"
+  } | sha256sum | cut -c1-8)
+export NAME="test-$HASH"
+OUTDIR="$NAME"
+export LOCATION=${LOCATION-ash}
+export KUBECONFIG="$OUTDIR/kubeconfig"
+export "$@"
+if [ -d "$OUTDIR" ]; then
+  echo "Output directory '$OUTDIR' already exists."
+  echo "Remove or rename it if you want to run that test again."
+  exit 1
+fi
+echo "Creating output directory: $OUTDIR"
+mkdir -p "$OUTDIR"
+envsubst < "$CONFIG" > "$OUTDIR/config.yaml"
+echo "$*" > "$OUTDIR/args"
+echo "$CONFIG" > "$OUTDIR/config"
+
+if ! [ -f sshkey ]; then
+  ssh-keygen -f sshkey -t ed25519 -N ""
+fi
+
+echo "pending" > "$OUTDIR/result"
+echo "creating" > "$OUTDIR/status"
+if timeout 5m "$HK3S" create --config "$OUTDIR/config.yaml" 2>&1 | tee "$OUTDIR/create.log" ; then
+(
+  echo "testing" > "$OUTDIR/status"
+  set +x -e
+  kubectl get nodes -o wide
+  kubectl create deployment blue --image jpetazzo/color
+  kubectl expose deployment blue --port 80
+  kubectl wait deployment blue --for=condition=Available
+  kubectl run --rm -it --restart=Never --image curlimages/curl curl http://blue
+  echo "ok" > "$OUTDIR/result"
+) 2>&1 | tee "$OUTDIR/kubectl.log" || true
+fi
+echo "deleting" > "$OUTDIR/status"
+timeout 5m "$HK3S" delete --config "$OUTDIR/config.yaml" 2>&1 | tee "$OUTDIR/delete.log"
+if !
grep -qw ok "$OUTDIR/result"; then + echo "error" > "$OUTDIR/result" +fi +echo "done" > "$OUTDIR/status" + diff --git a/exe/hetzner-k3s b/exe/hetzner-k3s deleted file mode 100755 index fd39ef21..00000000 --- a/exe/hetzner-k3s +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env ruby - -require_relative '../lib/hetzner/k3s/cli' -Hetzner::K3s::CLI.start diff --git a/hetzner-k3s.gemspec b/hetzner-k3s.gemspec deleted file mode 100644 index 061816d4..00000000 --- a/hetzner-k3s.gemspec +++ /dev/null @@ -1,35 +0,0 @@ -require_relative 'lib/hetzner/k3s/version' - -Gem::Specification.new do |spec| - spec.name = "hetzner-k3s" - spec.version = Hetzner::K3s::VERSION - spec.authors = ["Vito Botta"] - spec.email = ["vito@botta.me"] - - spec.summary = %q{A CLI to create a Kubernetes cluster in Hetzner Cloud very quickly using k3s.} - spec.description = %q{A CLI to create a Kubernetes cluster in Hetzner Cloud very quickly using k3s.} - spec.homepage = "https://github.com/vitobotta/hetzner-k3s" - spec.license = "MIT" - spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0") - - # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'" - - spec.metadata["homepage_uri"] = spec.homepage - spec.metadata["source_code_uri"] = "https://github.com/vitobotta/hetzner-k3s" - spec.metadata["changelog_uri"] = "https://github.com/vitobotta/hetzner-k3s" - - spec.add_dependency "thor" - spec.add_dependency "http" - spec.add_dependency "net-ssh" - spec.add_dependency "k8s-ruby" - spec.add_dependency "sshkey" - - # Specify which files should be added to the gem when it is released. - # The `git ls-files -z` loads the files in the RubyGem that have been added into git. - spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do - `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } - end - spec.bindir = "exe" - spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } - spec.require_paths = ["lib"] -end diff --git a/lib/hetzner.rb b/lib/hetzner.rb deleted file mode 100644 index b3b9f2d5..00000000 --- a/lib/hetzner.rb +++ /dev/null @@ -1,2 +0,0 @@ -module Hetzner -end diff --git a/lib/hetzner/infra.rb b/lib/hetzner/infra.rb deleted file mode 100644 index 764f0f4e..00000000 --- a/lib/hetzner/infra.rb +++ /dev/null @@ -1,2 +0,0 @@ -module Hetzner::Infra -end diff --git a/lib/hetzner/infra/client.rb b/lib/hetzner/infra/client.rb deleted file mode 100644 index 8771b207..00000000 --- a/lib/hetzner/infra/client.rb +++ /dev/null @@ -1,46 +0,0 @@ -module Hetzner - class Client - BASE_URI = "https://api.hetzner.cloud/v1" - - attr_reader :token - - def initialize(token:) - @token = token - end - - def get(path) - make_request do - JSON.parse HTTP.headers(headers).get(BASE_URI + path).body - end - end - - def post(path, data) - make_request do - HTTP.headers(headers).post(BASE_URI + path, json: data) - end - end - - def delete(path, id) - make_request do - HTTP.headers(headers).delete(BASE_URI + path + "/" + id.to_s) - end - end - - private - - def headers - { - "Authorization": "Bearer #{@token}", - "Content-Type": "application/json" - } - end - - def make_request &block - Timeout::timeout(5) do - block.call - end - rescue Timeout::Error - retry - end - end -end diff --git a/lib/hetzner/infra/firewall.rb b/lib/hetzner/infra/firewall.rb deleted file mode 100644 index 0d2379f0..00000000 --- a/lib/hetzner/infra/firewall.rb +++ /dev/null @@ -1,103 +0,0 @@ -module Hetzner - class Firewall - def initialize(hetzner_client:, cluster_name:) - @hetzner_client = hetzner_client - 
@cluster_name = cluster_name - end - - def create - puts - - if firewall = find_firewall - puts "Firewall already exists, skipping." - puts - return firewall["id"] - end - - puts "Creating firewall..." - - response = hetzner_client.post("/firewalls", firewall_config).body - puts "...firewall created." - puts - - JSON.parse(response)["firewall"]["id"] - end - - def delete - if firewall = find_firewall - puts "Deleting firewall..." - hetzner_client.delete("/firewalls", firewall["id"]) - puts "...firewall deleted." - else - puts "Firewall no longer exists, skipping." - end - - puts - end - - private - - attr_reader :hetzner_client, :cluster_name, :firewall - - def firewall_config - { - name: cluster_name, - rules: [ - { - "direction": "in", - "protocol": "tcp", - "port": "22", - "source_ips": [ - "0.0.0.0/0", - "::/0" - ], - "destination_ips": [] - }, - { - "direction": "in", - "protocol": "icmp", - "port": nil, - "source_ips": [ - "0.0.0.0/0", - "::/0" - ], - "destination_ips": [] - }, - { - "direction": "in", - "protocol": "tcp", - "port": "6443", - "source_ips": [ - "0.0.0.0/0", - "::/0" - ], - "destination_ips": [] - }, - { - "direction": "in", - "protocol": "tcp", - "port": "any", - "source_ips": [ - "10.0.0.0/16" - ], - "destination_ips": [] - }, - { - "direction": "in", - "protocol": "udp", - "port": "any", - "source_ips": [ - "10.0.0.0/16" - ], - "destination_ips": [] - } - ] - } - end - - def find_firewall - hetzner_client.get("/firewalls")["firewalls"].detect{ |firewall| firewall["name"] == cluster_name } - end - - end -end diff --git a/lib/hetzner/infra/load_balancer.rb b/lib/hetzner/infra/load_balancer.rb deleted file mode 100644 index d69b2b48..00000000 --- a/lib/hetzner/infra/load_balancer.rb +++ /dev/null @@ -1,84 +0,0 @@ -module Hetzner - class LoadBalancer - def initialize(hetzner_client:, cluster_name:) - @hetzner_client = hetzner_client - @cluster_name = cluster_name - end - - def create(location:, network_id:) - @location = location - @network_id = network_id - - puts - - if load_balancer = find_load_balancer - puts "API load balancer already exists, skipping." - puts - return load_balancer["id"] - end - - puts "Creating API load_balancer..." - - response = hetzner_client.post("/load_balancers", load_balancer_config).body - puts "...API load balancer created." - puts - - JSON.parse(response)["load_balancer"]["id"] - end - - def delete(ha:) - if load_balancer = find_load_balancer - puts "Deleting API load balancer..." unless ha - hetzner_client.delete("/load_balancers", load_balancer["id"]) - puts "...API load balancer deleted." unless ha - elsif ha - puts "API load balancer no longer exists, skipping." 
- end - - puts - end - - private - - attr_reader :hetzner_client, :cluster_name, :load_balancer, :location, :network_id - - def load_balancer_name - "#{cluster_name}-api" - end - - def load_balancer_config - { - "algorithm": { - "type": "round_robin" - }, - "load_balancer_type": "lb11", - "location": location, - "name": load_balancer_name, - "network": network_id, - "public_interface": true, - "services": [ - { - "destination_port": 6443, - "listen_port": 6443, - "protocol": "tcp", - "proxyprotocol": false - } - ], - "targets": [ - { - "label_selector": { - "selector": "cluster=#{cluster_name},role=master" - }, - "type": "label_selector", - "use_private_ip": true - } - ] - } - end - - def find_load_balancer - hetzner_client.get("/load_balancers")["load_balancers"].detect{ |load_balancer| load_balancer["name"] == load_balancer_name } - end - - end -end diff --git a/lib/hetzner/infra/network.rb b/lib/hetzner/infra/network.rb deleted file mode 100644 index 6decf95f..00000000 --- a/lib/hetzner/infra/network.rb +++ /dev/null @@ -1,62 +0,0 @@ -module Hetzner - class Network - def initialize(hetzner_client:, cluster_name:) - @hetzner_client = hetzner_client - @cluster_name = cluster_name - end - - def create - puts - - if network = find_network - puts "Private network already exists, skipping." - puts - return network["id"] - end - - puts "Creating private network..." - - response = hetzner_client.post("/networks", network_config).body - - puts "...private network created." - puts - - JSON.parse(response)["network"]["id"] - end - - def delete - if network = find_network - puts "Deleting network..." - hetzner_client.delete("/networks", network["id"]) - puts "...network deleted." - else - puts "Network no longer exists, skipping." - end - - puts - end - - private - - attr_reader :hetzner_client, :cluster_name - - def network_config - { - name: cluster_name, - ip_range: "10.0.0.0/16", - subnets: [ - { - ip_range: "10.0.0.0/16", - network_zone: "eu-central", - type: "cloud" - } - ] - } - end - - def find_network - hetzner_client.get("/networks")["networks"].detect{ |network| network["name"] == cluster_name } - end - - end -end diff --git a/lib/hetzner/infra/server.rb b/lib/hetzner/infra/server.rb deleted file mode 100644 index 8f4a1883..00000000 --- a/lib/hetzner/infra/server.rb +++ /dev/null @@ -1,81 +0,0 @@ -module Hetzner - class Server - def initialize(hetzner_client:, cluster_name:) - @hetzner_client = hetzner_client - @cluster_name = cluster_name - end - - def create(location:, instance_type:, instance_id:, firewall_id:, network_id:, ssh_key_id:) - puts - - server_name = "#{cluster_name}-#{instance_type}-#{instance_id}" - - if server = find_server(server_name) - puts "Server #{server_name} already exists, skipping." - puts - return server - end - - puts "Creating server #{server_name}..." - - server_config = { - name: server_name, - location: location, - image: "ubuntu-20.04", - firewalls: [ - { firewall: firewall_id } - ], - networks: [ - network_id - ], - server_type: instance_type, - ssh_keys: [ - ssh_key_id - ], - user_data: user_data, - labels: { - cluster: cluster_name, - role: (server_name =~ /master/ ? "master" : "worker") - } - } - - response = hetzner_client.post("/servers", server_config).body - - puts "...server #{server_name} created." - puts - - JSON.parse(response)["server"] - end - - def delete(server_name:) - if server = find_server(server_name) - puts "Deleting server #{server_name}..." - hetzner_client.delete "/servers", server["id"] - puts "...server #{server_name} deleted." 
- else - puts "Server #{server_name} no longer exists, skipping." - end - end - - private - - attr_reader :hetzner_client, :cluster_name - - def find_server(server_name) - hetzner_client.get("/servers")["servers"].detect{ |network| network["name"] == server_name } - end - - def user_data - <<~EOS - #cloud-config - packages: - - fail2ban - runcmd: - - sed -i 's/[#]*PermitRootLogin yes/PermitRootLogin prohibit-password/g' /etc/ssh/sshd_config - - sed -i 's/[#]*PasswordAuthentication yes/PasswordAuthentication no/g' /etc/ssh/sshd_config - - systemctl restart sshd - EOS - end - - end -end diff --git a/lib/hetzner/infra/ssh_key.rb b/lib/hetzner/infra/ssh_key.rb deleted file mode 100644 index d7a8cf9c..00000000 --- a/lib/hetzner/infra/ssh_key.rb +++ /dev/null @@ -1,81 +0,0 @@ -module Hetzner - class SSHKey - def initialize(hetzner_client:, cluster_name:) - @hetzner_client = hetzner_client - @cluster_name = cluster_name - end - - def create(ssh_key_path:) - @ssh_key_path = ssh_key_path - - puts - - if ssh_key = find_ssh_key - puts "SSH key already exists, skipping." - puts - return ssh_key["id"] - end - - puts "Creating SSH key..." - - response = hetzner_client.post("/ssh_keys", ssh_key_config).body - - puts "...SSH key created." - puts - - JSON.parse(response)["ssh_key"]["id"] - end - - def delete(ssh_key_path:) - @ssh_key_path = ssh_key_path - - if ssh_key = find_ssh_key - if ssh_key["name"] == cluster_name - puts "Deleting ssh_key..." - hetzner_client.delete("/ssh_keys", ssh_key["id"]) - puts "...ssh_key deleted." - else - puts "The SSH key existed before creating the cluster, so I won't delete it." - end - else - puts "SSH key no longer exists, skipping." - end - - puts - end - - private - - attr_reader :hetzner_client, :cluster_name, :ssh_key_path - - def public_key - @public_key ||= File.read(ssh_key_path).chop - end - - def ssh_key_config - { - name: cluster_name, - public_key: public_key - } - end - - def fingerprint - @fingerprint ||= ::SSHKey.fingerprint(public_key) - end - - def find_ssh_key - key = hetzner_client.get("/ssh_keys")["ssh_keys"].detect do |ssh_key| - ssh_key["fingerprint"] == fingerprint - end - - unless key - key = hetzner_client.get("/ssh_keys")["ssh_keys"].detect do |ssh_key| - ssh_key["name"] == cluster_name - end - end - - key - end - - end -end diff --git a/lib/hetzner/k3s/cli.rb b/lib/hetzner/k3s/cli.rb deleted file mode 100644 index 942c0bb9..00000000 --- a/lib/hetzner/k3s/cli.rb +++ /dev/null @@ -1,304 +0,0 @@ -require "thor" -require "http" -require "sshkey" - -require_relative "cluster" - -module Hetzner - module K3s - class CLI < Thor - def self.exit_on_failure? 
- true - end - - desc "create-cluster", "Create a k3s cluster in Hetzner Cloud" - option :config_file, required: true - - def create_cluster - validate_config_file :create - - Cluster.new(hetzner_client: hetzner_client).create configuration: configuration - end - - desc "delete-cluster", "Delete an existing k3s cluster in Hetzner Cloud" - option :config_file, required: true - - def delete_cluster - validate_config_file :delete - Cluster.new(hetzner_client: hetzner_client).delete configuration: configuration - end - - desc "upgrade-cluster", "Upgrade an existing k3s cluster in Hetzner Cloud to a new version" - option :config_file, required: true - option :new_k3s_version, required: true - option :force, default: "false" - - def upgrade_cluster - validate_config_file :upgrade - Cluster.new(hetzner_client: hetzner_client).upgrade configuration: configuration, new_k3s_version: options[:new_k3s_version], config_file: options[:config_file] - end - - desc "releases", "List available k3s releases" - def releases - find_available_releases.each do |release| - puts release - end - end - - private - - attr_reader :configuration, :hetzner_client, :k3s_version - attr_accessor :errors, :used_server_types - - def validate_config_file(action) - config_file_path = options[:config_file] - - if File.exists?(config_file_path) - begin - @configuration = YAML.load_file(options[:config_file]) - raise "invalid" unless configuration.is_a? Hash - rescue - puts "Please ensure that the config file is a correct YAML manifest." - return - end - else - puts "Please specify a correct path for the config file." - return - end - - @errors = [] - @used_server_types = [] - - validate_token - validate_cluster_name - validate_kubeconfig_path - - case action - when :create - validate_ssh_key - validate_location - validate_k3s_version - validate_masters - validate_worker_node_pools - validate_verify_host_key - when :delete - validate_kubeconfig_path_must_exist - when :upgrade - validate_kubeconfig_path_must_exist - validate_new_k3s_version - validate_new_k3s_version_must_be_more_recent - end - - errors.flatten! - - unless errors.empty? - puts "Some information in the configuration file requires your attention:" - errors.each do |error| - puts " - #{error}" - end - - exit 1 - end - end - - def validate_token - token = configuration.dig("hetzner_token") - @hetzner_client = Hetzner::Client.new(token: token) - hetzner_client.get("/locations") - rescue - errors << "Invalid Hetzner Cloid token" - end - - def validate_cluster_name - errors << "Cluster name is an invalid format" unless configuration["cluster_name"] =~ /\A([A-Za-z0-9\-\_]+)\Z/ - end - - def validate_kubeconfig_path - path = File.expand_path(configuration.dig("kubeconfig_path")) - errors << "kubeconfig path cannot be a directory" and return if File.directory? path - - directory = File.dirname(path) - errors << "Directory #{directory} doesn't exist" unless File.exists? directory - rescue - errors << "Invalid path for the kubeconfig" - end - - def validate_ssh_key - path = File.expand_path(configuration.dig("ssh_key_path")) - errors << "Invalid Public SSH key path" and return unless File.exists? path - - key = File.read(path) - errors << "Public SSH key is invalid" unless ::SSHKey.valid_ssh_public_key? key - rescue - errors << "Invalid Public SSH key path" - end - - def validate_kubeconfig_path_must_exist - path = File.expand_path configuration.dig("kubeconfig_path") - errors << "kubeconfig path is invalid" and return unless File.exists? 
path - errors << "kubeconfig path cannot be a directory" if File.directory? path - rescue - errors << "Invalid kubeconfig path" - end - - def server_types - @server_types ||= hetzner_client.get("/server_types")["server_types"].map{ |server_type| server_type["name"] } - rescue - @errors << "Cannot fetch server types with Hetzner API, please try again later" - false - end - - def locations - @locations ||= hetzner_client.get("/locations")["locations"].map{ |location| location["name"] } - rescue - @errors << "Cannot fetch locations with Hetzner API, please try again later" - false - end - - def validate_location - errors << "Invalid location - available locations: nbg1 (Nuremberg, Germany), fsn1 (Falkenstein, Germany), hel1 (Helsinki, Finland)" unless locations.include? configuration.dig("location") - end - - def find_available_releases - @available_releases ||= begin - response = HTTP.get("https://api.github.com/repos/k3s-io/k3s/tags").body - JSON.parse(response).map { |hash| hash["name"] } - end - rescue - errors << "Cannot fetch the releases with Hetzner API, please try again later" - end - - def validate_k3s_version - k3s_version = configuration.dig("k3s_version") - available_releases = find_available_releases - errors << "Invalid k3s version" unless available_releases.include? k3s_version - end - - def validate_new_k3s_version - new_k3s_version = options[:new_k3s_version] - available_releases = find_available_releases - errors << "The new k3s version is invalid" unless available_releases.include? new_k3s_version - end - - def validate_masters - masters_pool = nil - - begin - masters_pool = configuration.dig("masters") - rescue - errors << "Invalid masters configuration" - return - end - - if masters_pool.nil? - errors << "Invalid masters configuration" - return - end - - validate_instance_group masters_pool, workers: false - end - - def validate_worker_node_pools - worker_node_pools = nil - - begin - worker_node_pools = configuration.dig("worker_node_pools") - rescue - errors << "Invalid node pools configuration" - return - end - - if !worker_node_pools.is_a? Array - errors << "Invalid node pools configuration" - elsif worker_node_pools.size == 0 - errors << "At least one node pool is required in order to schedule workloads" - elsif worker_node_pools.map{ |worker_node_pool| worker_node_pool["name"]}.uniq.size != worker_node_pools.size - errors << "Each node pool must have an unique name" - elsif server_types - worker_node_pools.each do |worker_node_pool| - validate_instance_group worker_node_pool - end - end - end - - def validate_new_k3s_version_must_be_more_recent - return if options[:force] == "true" - return unless kubernetes_client - - begin - Timeout::timeout(5) do - servers = kubernetes_client.api("v1").resource("nodes").list - - if servers.size == 0 - errors << "The cluster seems to have no nodes, nothing to upgrade" - else - available_releases = find_available_releases - - current_k3s_version = servers.first.dig(:status, :nodeInfo, :kubeletVersion) - current_k3s_version_index = available_releases.index(current_k3s_version) || 1000 - - new_k3s_version = options[:new_k3s_version] - new_k3s_version_index = available_releases.index(new_k3s_version) || 1000 - - unless new_k3s_version_index < current_k3s_version_index - errors << "The new k3s version must be more recent than the current one" - end - end - end - - rescue Timeout::Error - puts "Cannot upgrade: Unable to fetch nodes from Kubernetes API. Is the cluster online?" 
- end - end - - def validate_instance_group(instance_group, workers: true) - instance_group_errors = [] - - instance_group_type = workers ? "Worker mode pool #{instance_group["name"]}" : "Masters pool" - - unless !workers || instance_group["name"] =~ /\A([A-Za-z0-9\-\_]+)\Z/ - instance_group_errors << "#{instance_group_type} has an invalid name" - end - - unless instance_group.is_a? Hash - instance_group_errors << "#{instance_group_type} is in an invalid format" - end - - unless server_types.include?(instance_group["instance_type"]) - instance_group_errors << "#{instance_group_type} has an invalid instance type" - end - - if instance_group["instance_count"].is_a? Integer - if instance_group["instance_count"] < 1 - instance_group_errors << "#{instance_group_type} must have at least one node" - elsif !workers - instance_group_errors << "Masters count must equal to 1 for non-HA clusters or an odd number (recommended 3) for an HA cluster" unless instance_group["instance_count"].odd? - end - else - instance_group_errors << "#{instance_group_type} has an invalid instance count" - end - - used_server_types << instance_group["instance_type"] - - errors << instance_group_errors - end - - def kubernetes_client - return @kubernetes_client if @kubernetes_client - - config_hash = YAML.load_file(File.expand_path(configuration["kubeconfig_path"])) - config_hash['current-context'] = configuration["cluster_name"] - @kubernetes_client = K8s::Client.config(K8s::Config.new(config_hash)) - rescue - errors << "Cannot connect to the Kubernetes cluster" - false - end - - - def validate_verify_host_key - return unless [true, false].include?(configuration.fetch("ssh_key_path", false)) - errors << "Please set the verify_host_key option to either true or false" - end - end - end -end diff --git a/lib/hetzner/k3s/client_patch.rb b/lib/hetzner/k3s/client_patch.rb deleted file mode 100644 index b5643bcf..00000000 --- a/lib/hetzner/k3s/client_patch.rb +++ /dev/null @@ -1,38 +0,0 @@ -module K8s - class ResourceClient - def initialize(transport, api_client, api_resource, namespace: nil, resource_class: K8s::Resource) - @transport = transport - @api_client = api_client - @api_resource = api_resource - @namespace = namespace - @resource_class = resource_class - - if @api_resource.name.include? '/' - @resource, @subresource = @api_resource.name.split('/', 2) - else - @resource = @api_resource.name - @subresource = nil - end - - # fail "Resource #{api_resource.name} is not namespaced" unless api_resource.namespaced || !namespace - end - - def path(name = nil, subresource: @subresource, namespace: @namespace) - namespace_part = namespace ? ['namespaces', namespace] : [] - - if namespaced? - if name && subresource - @api_client.path(*namespace_part, @resource, name, subresource) - elsif name - @api_client.path(*namespace_part, @resource, name) - else namespaced? 
- @api_client.path(*namespace_part, @resource) - end - elsif name - @api_client.path(@resource, name) - else - @api_client.path(@resource) - end - end - end -end diff --git a/lib/hetzner/k3s/cluster.rb b/lib/hetzner/k3s/cluster.rb deleted file mode 100644 index 80ffae4c..00000000 --- a/lib/hetzner/k3s/cluster.rb +++ /dev/null @@ -1,627 +0,0 @@ -require 'thread' -require 'net/ssh' -require "securerandom" -require "base64" -require "k8s-ruby" -require 'timeout' - -require_relative "../infra/client" -require_relative "../infra/firewall" -require_relative "../infra/network" -require_relative "../infra/ssh_key" -require_relative "../infra/server" -require_relative "../infra/load_balancer" - -require_relative "../k3s/client_patch" - - -class Cluster - def initialize(hetzner_client:) - @hetzner_client = hetzner_client - end - - def create(configuration:) - @hetzner_token = configuration.dig("hetzner_token") - @cluster_name = configuration.dig("cluster_name") - @kubeconfig_path = File.expand_path(configuration.dig("kubeconfig_path")) - @ssh_key_path = File.expand_path(configuration.dig("ssh_key_path")) - @k3s_version = configuration.dig("k3s_version") - @masters_config = configuration.dig("masters") - @worker_node_pools = configuration.dig("worker_node_pools") - @location = configuration.dig("location") - @verify_host_key = configuration.fetch("verify_host_key", false) - @servers = [] - - create_resources - - deploy_kubernetes - - sleep 10 - - deploy_cloud_controller_manager - deploy_csi_driver - deploy_system_upgrade_controller - end - - def delete(configuration:) - @cluster_name = configuration.dig("cluster_name") - @kubeconfig_path = File.expand_path(configuration.dig("kubeconfig_path")) - @ssh_key_path = File.expand_path(configuration.dig("ssh_key_path")) - - delete_resources - end - - def upgrade(configuration:, new_k3s_version:, config_file:) - @configuration = configuration - @cluster_name = configuration.dig("cluster_name") - @kubeconfig_path = File.expand_path(configuration.dig("kubeconfig_path")) - @new_k3s_version = new_k3s_version - @config_file = config_file - - upgrade_cluster - end - - private - - attr_accessor :servers - - attr_reader :hetzner_client, :cluster_name, :kubeconfig_path, :k3s_version, - :masters_config, :worker_node_pools, - :location, :ssh_key_path, :kubernetes_client, - :hetzner_token, :tls_sans, :new_k3s_version, :configuration, - :config_file, :verify_host_key - - - def latest_k3s_version - response = HTTP.get("https://api.github.com/repos/k3s-io/k3s/tags").body - JSON.parse(response).first["name"] - end - - def create_resources - firewall_id = Hetzner::Firewall.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).create - - network_id = Hetzner::Network.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).create - - ssh_key_id = Hetzner::SSHKey.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).create(ssh_key_path: ssh_key_path) - - server_configs = [] - - master_instance_type = masters_config["instance_type"] - masters_count = masters_config["instance_count"] - - masters_count.times do |i| - server_configs << { - location: location, - instance_type: master_instance_type, - instance_id: "master#{i+1}", - firewall_id: firewall_id, - network_id: network_id, - ssh_key_id: ssh_key_id - } - end - - if masters_count > 1 - Hetzner::LoadBalancer.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).create(location: location, network_id: network_id) - end - - worker_node_pools.each do 
|worker_node_pool| - worker_node_pool_name = worker_node_pool["name"] - worker_instance_type = worker_node_pool["instance_type"] - worker_count = worker_node_pool["instance_count"] - - worker_count.times do |i| - server_configs << { - location: location, - instance_type: worker_instance_type, - instance_id: "pool-#{worker_node_pool_name}-worker#{i+1}", - firewall_id: firewall_id, - network_id: network_id, - ssh_key_id: ssh_key_id - } - end - end - - threads = server_configs.map do |server_config| - Thread.new do - servers << Hetzner::Server.new(hetzner_client: hetzner_client, cluster_name: cluster_name).create(server_config) - end - end - - threads.each(&:join) unless threads.empty? - - puts - threads = servers.map do |server| - Thread.new { wait_for_ssh server } - end - - threads.each(&:join) unless threads.empty? - end - - def delete_resources - # Deleting nodes defined according to Kubernetes first - begin - Timeout::timeout(5) do - servers = kubernetes_client.api("v1").resource("nodes").list - - threads = servers.map do |node| - Thread.new do - Hetzner::Server.new(hetzner_client: hetzner_client, cluster_name: cluster_name).delete(server_name: node.metadata[:name]) - end - end - - threads.each(&:join) unless threads.empty? - end - rescue Timeout::Error, Excon::Error::Socket - puts "Unable to fetch nodes from Kubernetes API. Is the cluster online?" - end - - # Deleting nodes defined in the config file just in case there are leftovers i.e. nodes that - # were not part of the cluster for some reason - - threads = all_servers.map do |server| - Thread.new do - Hetzner::Server.new(hetzner_client: hetzner_client, cluster_name: cluster_name).delete(server_name: server["name"]) - end - end - - threads.each(&:join) unless threads.empty? - - puts - - sleep 5 # give time for the servers to actually be deleted - - Hetzner::Firewall.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).delete - - Hetzner::Network.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).delete - - Hetzner::SSHKey.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).delete(ssh_key_path: ssh_key_path) - - Hetzner::LoadBalancer.new( - hetzner_client: hetzner_client, - cluster_name: cluster_name - ).delete(ha: (masters.size > 1)) - - end - - def upgrade_cluster - resources = K8s::Resource.from_files(ugrade_plan_manifest_path) - - begin - kubernetes_client.api("upgrade.cattle.io/v1").resource("plans").get("k3s-server", namespace: "system-upgrade") - - puts "Aborting - an upgrade is already in progress." - - rescue K8s::Error::NotFound - resources.each do |resource| - kubernetes_client.create_resource(resource) - end - - puts "Upgrade will now start. Run `watch kubectl get nodes` to see the nodes being upgraded. This should take a few minutes for a small cluster." - puts "The API server may be briefly unavailable during the upgrade of the controlplane." - - configuration["k3s_version"] = new_k3s_version - - File.write(config_file, configuration.to_yaml) - end - end - - - def master_script(master) - server = master == first_master ? 
" --cluster-init " : " --server https://#{first_master_private_ip}:6443 " - flannel_interface = find_flannel_interface(master) - - <<~EOF - curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="#{k3s_version}" K3S_TOKEN="#{k3s_token}" INSTALL_K3S_EXEC="server \ - --disable-cloud-controller \ - --disable servicelb \ - --disable traefik \ - --disable local-storage \ - --disable metrics-server \ - --write-kubeconfig-mode=644 \ - --node-name="$(hostname -f)" \ - --cluster-cidr=10.244.0.0/16 \ - --etcd-expose-metrics=true \ - --kube-controller-manager-arg="address=0.0.0.0" \ - --kube-controller-manager-arg="bind-address=0.0.0.0" \ - --kube-proxy-arg="metrics-bind-address=0.0.0.0" \ - --kube-scheduler-arg="address=0.0.0.0" \ - --kube-scheduler-arg="bind-address=0.0.0.0" \ - --node-taint CriticalAddonsOnly=true:NoExecute \ - --kubelet-arg="cloud-provider=external" \ - --node-ip=$(hostname -I | awk '{print $2}') \ - --node-external-ip=$(hostname -I | awk '{print $1}') \ - --flannel-iface=#{flannel_interface} \ - #{server} #{tls_sans}" sh - - EOF - end - - def worker_script(worker) - flannel_interface = find_flannel_interface(worker) - - <<~EOF - curl -sfL https://get.k3s.io | K3S_TOKEN="#{k3s_token}" INSTALL_K3S_VERSION="#{k3s_version}" K3S_URL=https://#{first_master_private_ip}:6443 INSTALL_K3S_EXEC="agent \ - --node-name="$(hostname -f)" \ - --kubelet-arg="cloud-provider=external" \ - --node-ip=$(hostname -I | awk '{print $2}') \ - --node-external-ip=$(hostname -I | awk '{print $1}') \ - --flannel-iface=#{flannel_interface}" sh - - EOF - end - - def deploy_kubernetes - puts - puts "Deploying k3s to first master (#{first_master["name"]})..." - - ssh first_master, master_script(first_master), print_output: true - - puts - puts "...k3s has been deployed to first master." - - save_kubeconfig - - if masters.size > 1 - threads = masters[1..-1].map do |master| - Thread.new do - puts - puts "Deploying k3s to master #{master["name"]}..." - - ssh master, master_script(master), print_output: true - - puts - puts "...k3s has been deployed to master #{master["name"]}." - end - end - - threads.each(&:join) unless threads.empty? - end - - threads = workers.map do |worker| - Thread.new do - puts - puts "Deploying k3s to worker (#{worker["name"]})..." - - ssh worker, worker_script(worker), print_output: true - - puts - puts "...k3s has been deployed to worker (#{worker["name"]})." - end - end - - threads.each(&:join) unless threads.empty? - end - - def deploy_cloud_controller_manager - puts - puts "Deploying Hetzner Cloud Controller Manager..." 
- - begin - kubernetes_client.api("v1").resource("secrets").get("hcloud", namespace: "kube-system") - - rescue K8s::Error::NotFound - secret = K8s::Resource.new( - apiVersion: "v1", - kind: "Secret", - metadata: { - namespace: 'kube-system', - name: 'hcloud', - }, - data: { - network: Base64.encode64(cluster_name), - token: Base64.encode64(hetzner_token) - } - ) - - kubernetes_client.api('v1').resource('secrets').create_resource(secret) - end - - - manifest = HTTP.follow.get("https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/latest/download/ccm-networks.yaml").body - - File.write("/tmp/cloud-controller-manager.yaml", manifest) - - resources = K8s::Resource.from_files("/tmp/cloud-controller-manager.yaml") - - begin - kubernetes_client.api("apps/v1").resource("deployments").get("hcloud-cloud-controller-manager", namespace: "kube-system") - - resources.each do |resource| - kubernetes_client.update_resource(resource) - end - - rescue K8s::Error::NotFound - resources.each do |resource| - kubernetes_client.create_resource(resource) - end - - end - - puts "...Cloud Controller Manager deployed" - rescue Excon::Error::Socket - retry - end - - def deploy_system_upgrade_controller - puts - puts "Deploying k3s System Upgrade Controller..." - - manifest = HTTP.follow.get("https://github.com/rancher/system-upgrade-controller/releases/download/v0.7.3/system-upgrade-controller.yaml").body - - File.write("/tmp/system-upgrade-controller.yaml", manifest) - - resources = K8s::Resource.from_files("/tmp/system-upgrade-controller.yaml") - - begin - kubernetes_client.api("apps/v1").resource("deployments").get("system-upgrade-controller", namespace: "system-upgrade") - - resources.each do |resource| - kubernetes_client.update_resource(resource) - end - - rescue K8s::Error::NotFound - resources.each do |resource| - kubernetes_client.create_resource(resource) - end - - end - - puts "...k3s System Upgrade Controller deployed" - rescue Excon::Error::Socket - retry - end - - def deploy_csi_driver - puts - puts "Deploying Hetzner CSI Driver..." - - begin - kubernetes_client.api("v1").resource("secrets").get("hcloud-csi", namespace: "kube-system") - - rescue K8s::Error::NotFound - secret = K8s::Resource.new( - apiVersion: "v1", - kind: "Secret", - metadata: { - namespace: 'kube-system', - name: 'hcloud-csi', - }, - data: { - token: Base64.encode64(hetzner_token) - } - ) - - kubernetes_client.api('v1').resource('secrets').create_resource(secret) - end - - - manifest = HTTP.follow.get("https://raw.githubusercontent.com/hetznercloud/csi-driver/v1.5.3/deploy/kubernetes/hcloud-csi.yml").body - - File.write("/tmp/csi-driver.yaml", manifest) - - resources = K8s::Resource.from_files("/tmp/csi-driver.yaml") - - begin - kubernetes_client.api("apps/v1").resource("daemonsets").get("hcloud-csi-node", namespace: "kube-system") - - - resources.each do |resource| - begin - kubernetes_client.update_resource(resource) - rescue K8s::Error::Invalid => e - raise e unless e.message =~ /must be specified/i - end - end - - rescue K8s::Error::NotFound - resources.each do |resource| - kubernetes_client.create_resource(resource) - end - - end - - puts "...CSI Driver deployed" - rescue Excon::Error::Socket - retry - end - - def wait_for_ssh(server) - Timeout::timeout(5) do - server_name = server["name"] - - puts "Waiting for server #{server_name} to be up..." - - loop do - result = ssh(server, "echo UP") - break if result == "UP" - end - - puts "...server #{server_name} is now up." 
- end - rescue Errno::ENETUNREACH, Errno::EHOSTUNREACH, Timeout::Error - retry - end - - def ssh(server, command, print_output: false) - public_ip = server.dig("public_net", "ipv4", "ip") - output = "" - - Net::SSH.start(public_ip, "root", verify_host_key: (verify_host_key ? :always : :never)) do |session| - session.exec!(command) do |channel, stream, data| - output << data - puts data if print_output - end - end - output.chop - rescue Net::SSH::Disconnect => e - retry unless e.message =~ /Too many authentication failures/ - rescue Net::SSH::ConnectionTimeout, Errno::ECONNREFUSED, Errno::ENETUNREACH, Errno::EHOSTUNREACH - retry - rescue Net::SSH::HostKeyMismatch - puts - puts "Cannot continue: Unable to SSH into server with IP #{public_ip} because the existing fingerprint in the known_hosts file does not match that of the actual host key." - puts "This is due to a security check but can also happen when creating a new server that gets assigned the same IP address as another server you've owned in the past." - puts "If are sure no security is being violated here and you're just creating new servers, you can eiher remove the relevant lines from your known_hosts (see IPs from the cloud console) or disable host key verification by setting the option 'verify_host_key' to false in the configuration file for the cluster." - exit 1 - end - - def kubernetes_client - return @kubernetes_client if @kubernetes_client - - config_hash = YAML.load_file(kubeconfig_path) - config_hash['current-context'] = cluster_name - @kubernetes_client = K8s::Client.config(K8s::Config.new(config_hash)) - end - - def find_flannel_interface(server) - if ssh(server, "lscpu | grep Vendor") =~ /Intel/ - "ens10" - else - "enp7s0" - end - end - - def all_servers - @all_servers ||= hetzner_client.get("/servers")["servers"] - end - - def masters - @masters ||= all_servers.select{ |server| server["name"] =~ /master\d+\Z/ }.sort{ |a, b| a["name"] <=> b["name"] } - end - - def workers - @workers = all_servers.select{ |server| server["name"] =~ /worker\d+\Z/ }.sort{ |a, b| a["name"] <=> b["name"] } - end - - def k3s_token - @k3s_token ||= begin - token = ssh(first_master, "{ TOKEN=$(< /var/lib/rancher/k3s/server/node-token); } 2> /dev/null; echo $TOKEN") - - if token.empty? - SecureRandom.hex - else - token.split(":").last - end - end - end - - def first_master_private_ip - @first_master_private_ip ||= first_master["private_net"][0]["ip"] - end - - def first_master - masters.first - end - - def api_server_ip - return @api_server_ip if @api_server_ip - - @api_server_ip = if masters.size > 1 - load_balancer_name = "#{cluster_name}-api" - load_balancer = hetzner_client.get("/load_balancers")["load_balancers"].detect{ |load_balancer| load_balancer["name"] == load_balancer_name } - load_balancer["public_net"]["ipv4"]["ip"] - else - first_master_public_ip - end - end - - def tls_sans - sans = " --tls-san=#{api_server_ip} " - - masters.each do |master| - master_private_ip = master["private_net"][0]["ip"] - sans << " --tls-san=#{master_private_ip} " - end - - sans - end - - def first_master_public_ip - @first_master_public_ip ||= first_master.dig("public_net", "ipv4", "ip") - end - - def save_kubeconfig - kubeconfig = ssh(first_master, "cat /etc/rancher/k3s/k3s.yaml"). - gsub("127.0.0.1", api_server_ip). 
- gsub("default", cluster_name) - - File.write(kubeconfig_path, kubeconfig) - end - - def ugrade_plan_manifest_path - worker_upgrade_concurrency = workers.size - 1 - worker_upgrade_concurrency = 1 if worker_upgrade_concurrency == 0 - - manifest = <<~EOF - apiVersion: upgrade.cattle.io/v1 - kind: Plan - metadata: - name: k3s-server - namespace: system-upgrade - labels: - k3s-upgrade: server - spec: - concurrency: 1 - version: #{new_k3s_version} - nodeSelector: - matchExpressions: - - {key: node-role.kubernetes.io/master, operator: In, values: ["true"]} - serviceAccountName: system-upgrade - tolerations: - - key: "CriticalAddonsOnly" - operator: "Equal" - value: "true" - effect: "NoExecute" - cordon: true - upgrade: - image: rancher/k3s-upgrade - --- - apiVersion: upgrade.cattle.io/v1 - kind: Plan - metadata: - name: k3s-agent - namespace: system-upgrade - labels: - k3s-upgrade: agent - spec: - concurrency: #{worker_upgrade_concurrency} - version: #{new_k3s_version} - nodeSelector: - matchExpressions: - - {key: node-role.kubernetes.io/master, operator: NotIn, values: ["true"]} - serviceAccountName: system-upgrade - prepare: - image: rancher/k3s-upgrade - args: ["prepare", "k3s-server"] - cordon: true - upgrade: - image: rancher/k3s-upgrade - EOF - - temp_file_path = "/tmp/k3s-upgrade-plan.yaml" - - File.write(temp_file_path, manifest) - - temp_file_path - end - -end diff --git a/lib/hetzner/k3s/version.rb b/lib/hetzner/k3s/version.rb deleted file mode 100644 index 09a490fb..00000000 --- a/lib/hetzner/k3s/version.rb +++ /dev/null @@ -1,5 +0,0 @@ -module Hetzner - module K3s - VERSION = "0.3.1" - end -end diff --git a/logo.png b/logo.png new file mode 100644 index 00000000..16386130 Binary files /dev/null and b/logo.png differ diff --git a/shard.lock b/shard.lock new file mode 100644 index 00000000..0f02d3ec --- /dev/null +++ b/shard.lock @@ -0,0 +1,46 @@ +version: 2.0 +shards: + admiral: + git: https://github.com/jwaldrip/admiral.cr.git + version: 1.12.1 + + crest: + git: https://github.com/mamantoha/crest.git + version: 1.4.1 + + crinja: + git: https://github.com/straight-shoota/crinja.git + version: 0.8.1 + + cron_parser: + git: https://github.com/kostya/cron_parser.git + version: 0.4.0 + + future: + git: https://github.com/crystal-community/future.cr.git + version: 1.0.0 + + http-client-digest_auth: + git: https://github.com/mamantoha/http-client-digest_auth.git + version: 0.6.0 + + http_proxy: + git: https://github.com/mamantoha/http_proxy.git + version: 0.12.0 + + ipaddress: + git: https://github.com/sija/ipaddress.cr.git + version: 0.2.3 + + retriable: + git: https://github.com/sija/retriable.cr.git + version: 0.2.5 + + ssh2: + git: https://github.com/spider-gazelle/ssh2.cr.git + version: 1.6.1 + + tasker: + git: https://github.com/spider-gazelle/tasker.git + version: 2.1.4 + diff --git a/shard.yml b/shard.yml new file mode 100644 index 00000000..7c6d5ae7 --- /dev/null +++ b/shard.yml @@ -0,0 +1,30 @@ +name: hetzner-k3s +version: 0.1.0 + +authors: + - Vito Botta + +targets: + hetzner-k3s: + main: src/hetzner-k3s.cr + +crystal: 1.5.0 + +license: MIT + +dependencies: + admiral: + github: jwaldrip/admiral.cr + crest: + github: mamantoha/crest + ssh2: + github: spider-gazelle/ssh2.cr + ipaddress: + github: sija/ipaddress.cr + retriable: + github: Sija/retriable.cr + tasker: + github: spider-gazelle/tasker + crinja: + github: straight-shoota/crinja + diff --git a/spec/k3s_spec.rb b/spec/k3s_spec.rb deleted file mode 100644 index 2c7de877..00000000 --- a/spec/k3s_spec.rb +++ /dev/null @@ -1,9 
+0,0 @@ -RSpec.describe K3s do - it "has a version number" do - expect(K3s::VERSION).not_to be nil - end - - it "does something useful" do - expect(false).to eq(true) - end -end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb deleted file mode 100644 index 49080ea3..00000000 --- a/spec/spec_helper.rb +++ /dev/null @@ -1,14 +0,0 @@ -require "bundler/setup" -require "k3s" - -RSpec.configure do |config| - # Enable flags like --only-failures and --next-failure - config.example_status_persistence_file_path = ".rspec_status" - - # Disable RSpec exposing methods globally on `Module` and `main` - config.disable_monkey_patching! - - config.expect_with :rspec do |c| - c.syntax = :expect - end -end diff --git a/src/cluster/create.cr b/src/cluster/create.cr new file mode 100644 index 00000000..e7243b2f --- /dev/null +++ b/src/cluster/create.cr @@ -0,0 +1,357 @@ +require "../configuration/main" +require "../configuration/loader" +require "../hetzner/client" +require "../hetzner/placement_group/create" +require "../hetzner/placement_group/all" +require "../hetzner/ssh_key/create" +require "../hetzner/firewall/create" +require "../hetzner/network/create" +require "../hetzner/instance/create" +require "../hetzner/load_balancer/create" +require "../util/ssh" +require "../kubernetes/installer" +require "../util/ssh" + +class Cluster::Create + MAX_PLACEMENT_GROUPS = 50 + MAX_INSTANCES_PER_PLACEMENT_GROUP = 10 # Assuming this is the maximum number of instances per placement group + + private getter configuration : Configuration::Loader + private getter hetzner_client : Hetzner::Client do + configuration.hetzner_client + end + private getter settings : Configuration::Main do + configuration.settings + end + private getter autoscaling_worker_node_pools : Array(Configuration::NodePool) do + settings.worker_node_pools.select(&.autoscaling_enabled) + end + private getter ssh_client : Util::SSH do + Util::SSH.new(settings.networking.ssh.private_key_path, settings.networking.ssh.public_key_path) + end + + private getter network : Hetzner::Network? + private getter ssh_key : Hetzner::SSHKey + private getter load_balancer : Hetzner::LoadBalancer? + private getter placement_groups : Hash(String, Hetzner::PlacementGroup?) 
= Hash(String, Hetzner::PlacementGroup?).new + private property instances : Array(Hetzner::Instance) = [] of Hetzner::Instance + + private getter master_instance_creators : Array(Hetzner::Instance::Create) + private getter worker_instance_creators : Array(Hetzner::Instance::Create) + + private property kubernetes_masters_installation_queue_channel do + Channel(Hetzner::Instance).new(5) + end + private property kubernetes_workers_installation_queue_channel do + Channel(Hetzner::Instance).new(10) + end + + private property completed_channel : Channel(Nil) = Channel(Nil).new + + private property mutex : Mutex = Mutex.new + private property all_placement_groups : Array(Hetzner::PlacementGroup) = Array(Hetzner::PlacementGroup).new + + def initialize(@configuration) + @network = find_or_create_network if settings.networking.private_network.enabled + @ssh_key = create_ssh_key + @all_placement_groups = Hetzner::PlacementGroup::All.new(hetzner_client).delete_unused + @master_instance_creators = initialize_master_instance_creators + @worker_instance_creators = initialize_worker_instance_creators + end + + def run + create_instances_concurrently(master_instance_creators, kubernetes_masters_installation_queue_channel, wait: true) + + configure_firewall + # create_load_balancer if master_instance_creators.size > 1 + + kubernetes_installer = Kubernetes::Installer.new( + configuration, + # load_balancer, + ssh_client, + autoscaling_worker_node_pools + ) + + spawn do + kubernetes_installer.run( + masters_installation_queue_channel: kubernetes_masters_installation_queue_channel, + workers_installation_queue_channel: kubernetes_workers_installation_queue_channel, + completed_channel: completed_channel, + master_count: master_instance_creators.size, + worker_count: worker_instance_creators.size + ) + end + + create_instances_concurrently(worker_instance_creators, kubernetes_workers_installation_queue_channel) + + completed_channel.receive + + delete_unused_placement_groups + end + + private def initialize_master_instance_creators + creators = Array(Hetzner::Instance::Create).new + + masters_pool = settings.masters_pool + placement_group = create_placement_group_for_masters + + masters_pool.instance_count.times do |i| + creators << create_master_instance(i, placement_group) + end + + creators + end + + private def create_placement_group_for_masters + placement_group_name = "#{settings.cluster_name}-masters" + + placement_group = all_placement_groups.find { |pg| pg.name == placement_group_name } + + unless placement_group + placement_group = Hetzner::PlacementGroup::Create.new( + hetzner_client: hetzner_client, + placement_group_name: placement_group_name + ).run + + track_placement_group(placement_group) + end + + placement_group + end + + private def track_placement_group(placement_group) + mutex.synchronize do + unless all_placement_groups.any? { |pg| pg.name == placement_group.name } + all_placement_groups << placement_group + end + end + end + + private def create_master_instance(index : Int32, placement_group : Hetzner::PlacementGroup?) 
: Hetzner::Instance::Create + instance_type = settings.masters_pool.instance_type + + master_name = if settings.include_instance_type_in_instance_name + "#{settings.cluster_name}-#{instance_type}-master#{index + 1}" + else + "#{settings.cluster_name}-master#{index + 1}" + end + + image = settings.masters_pool.image || settings.image + additional_packages = settings.masters_pool.additional_packages || settings.additional_packages + additional_post_create_commands = settings.masters_pool.post_create_commands || settings.post_create_commands + + Hetzner::Instance::Create.new( + settings: settings, + hetzner_client: hetzner_client, + mutex: mutex, + instance_name: master_name, + instance_type: instance_type, + image: image, + ssh_key: ssh_key, + network: network, + placement_group: placement_group, + additional_packages: additional_packages, + additional_post_create_commands: additional_post_create_commands + ) + end + + private def initialize_worker_instance_creators + creators = Array(Hetzner::Instance::Create).new + no_autoscaling_worker_node_pools = settings.worker_node_pools.reject(&.autoscaling_enabled) + + create_placement_groups_for_worker_node_pools(no_autoscaling_worker_node_pools) + + no_autoscaling_worker_node_pools.each do |node_pool| + node_pool_placement_groups = all_placement_groups.select { |pg| pg.name.includes?("#{settings.cluster_name}-#{node_pool.name}-") } + node_pool.instance_count.times do |i| + placement_group = node_pool_placement_groups[(i // MAX_INSTANCES_PER_PLACEMENT_GROUP) % node_pool_placement_groups.size] + creators << create_worker_instance(i, node_pool, placement_group) + end + end + + creators + end + + private def create_placement_groups_for_worker_node_pools(node_pools) + node_pools = node_pools.sort_by(&.name.not_nil!) + + remaining_placement_groups = MAX_PLACEMENT_GROUPS - all_placement_groups.size + placement_groups_channel = Channel(Hetzner::PlacementGroup).new + created_placement_groups = 0 + + node_pools.each do |node_pool| + next if node_pool.instance_count <= 0 + + # Subtract only this pool's newly created placement groups from the remaining budget + placement_groups_for_pool = create_placement_groups_for_node_pool(node_pool, remaining_placement_groups, placement_groups_channel) + created_placement_groups += placement_groups_for_pool + remaining_placement_groups -= placement_groups_for_pool + + break if remaining_placement_groups <= 0 + end + + created_placement_groups.times { placement_groups_channel.receive } + end + + private def delete_unused_placement_groups + mutex.synchronize do + @all_placement_groups = Hetzner::PlacementGroup::All.new(hetzner_client).delete_unused + end + end + + private def create_placement_groups_for_node_pool(node_pool, remaining_placement_groups, placement_groups_channel) + placement_groups_count = (node_pool.instance_count / MAX_INSTANCES_PER_PLACEMENT_GROUP).ceil.to_i + placement_groups_count = [placement_groups_count, remaining_placement_groups].min + created_placement_groups = 0 + + ((all_placement_groups.size + 1)..(all_placement_groups.size + placement_groups_count)).each do |index| + placement_group_name = "#{settings.cluster_name}-#{node_pool.name}-#{index}" + + next if all_placement_groups.any?
{ |pg| pg.name == placement_group_name } + + spawn do + placement_group = Hetzner::PlacementGroup::Create.new( + hetzner_client: hetzner_client, + placement_group_name: placement_group_name + ).run + + track_placement_group(placement_group) + placement_groups_channel.send(placement_group) + end + + created_placement_groups += 1 + break if remaining_placement_groups - created_placement_groups <= 0 + end + + created_placement_groups + end + + private def create_worker_instance(index : Int32, node_pool, placement_group : Hetzner::PlacementGroup?) : Hetzner::Instance::Create + instance_type = node_pool.instance_type + + node_name = if settings.include_instance_type_in_instance_name + "#{settings.cluster_name}-#{instance_type}-pool-#{node_pool.name}-worker#{index + 1}" + else + "#{settings.cluster_name}-pool-#{node_pool.name}-worker#{index + 1}" + end + + image = node_pool.image || settings.image + additional_packages = node_pool.additional_packages || settings.additional_packages + additional_post_create_commands = node_pool.post_create_commands || settings.post_create_commands + + Hetzner::Instance::Create.new( + settings: settings, + hetzner_client: hetzner_client, + mutex: mutex, + instance_name: node_name, + instance_type: instance_type, + image: image, + location: node_pool.location, + ssh_key: ssh_key, + network: network, + placement_group: placement_group, + additional_packages: additional_packages, + additional_post_create_commands: additional_post_create_commands + ) + end + + private def create_load_balancer + @load_balancer = Hetzner::LoadBalancer::Create.new( + settings: settings, + hetzner_client: hetzner_client, + location: configuration.masters_location, + network_id: network.try(&.id) + ).run + end + + private def create_instances_concurrently(instance_creators, kubernetes_installation_queue_channel, wait = false) + wait_channel = Channel(Hetzner::Instance::Create).new + semaphore = Channel(Nil).new(50) + + instance_creators.each do |instance_creator| + semaphore.send(nil) + spawn do + instance = nil + begin + Retriable.retry(max_attempts: 3, on: Tasker::Timeout, backoff: false) do + Tasker.timeout(60.seconds) do + instance = instance_creator.run + end + end + + semaphore.receive # release the semaphore immediately after instance creation + rescue e : Exception + puts "Error creating instance: #{e.message}" + ensure + created_instance = instance + + if created_instance + mutex.synchronize { instances << created_instance } + wait_channel = wait_channel.send(instance_creator) if wait + kubernetes_installation_queue_channel.send(created_instance) + else + puts "Instance creation for #{instance_creator.instance_name} failed. Try rerunning the create command." 
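+ # A failed creator is not re-queued; as suggested above, rerunning the create command will create any instances that are still missing.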
+ + end + end + end + + return unless wait + + instance_creators.size.times do + wait_channel.receive + end + end + + private def find_network + existing_network_name = settings.networking.private_network.existing_network_name + + return find_existing_network(existing_network_name) unless existing_network_name.empty? + create_new_network + end + + private def find_existing_network(existing_network_name) + Hetzner::Network::Find.new(hetzner_client, existing_network_name).run + end + + private def create_new_network + return unless settings.networking.private_network.enabled + + Hetzner::Network::Create.new( + settings: settings, + hetzner_client: hetzner_client, + network_name: settings.cluster_name, + locations: configuration.locations + ).run + end + + private def find_or_create_network + find_network || create_new_network + end + + private def configure_firewall + Hetzner::Firewall::Create.new( + settings: settings, + hetzner_client: hetzner_client, + firewall_name: settings.cluster_name, + masters: masters + ).run + end + + private def create_ssh_key + Hetzner::SSHKey::Create.new( + hetzner_client: hetzner_client, + settings: settings + ).run + end + + private def default_log_prefix + "Cluster create" + end + + private def masters + instances.select { |instance| instance.master? }.sort_by(&.name) + end + + private def workers + instances.reject { |instance| instance.master? }.sort_by(&.name) + end +end diff --git a/src/cluster/delete.cr b/src/cluster/delete.cr new file mode 100644 index 00000000..04bb4458 --- /dev/null +++ b/src/cluster/delete.cr @@ -0,0 +1,149 @@ +require "../configuration/loader" +require "../hetzner/placement_group/delete" +require "../hetzner/ssh_key/delete" +require "../hetzner/firewall/delete" +require "../hetzner/network/delete" +require "../hetzner/instance/delete" +require "../hetzner/load_balancer/delete" +require "../hetzner/placement_group/all" +require "../util/shell" +require "../util" + +class Cluster::Delete + include Util + include Util::Shell + + private getter configuration : Configuration::Loader + private getter hetzner_client : Hetzner::Client do + configuration.hetzner_client + end + private getter settings : Configuration::Main do + configuration.settings + end + private property instance_deletors : Array(Hetzner::Instance::Delete) = [] of Hetzner::Instance::Delete + + def initialize(@configuration) + end + + def run + delete_resources + File.delete(settings.kubeconfig_path) if File.exists?(settings.kubeconfig_path) + end + + private def delete_resources + # delete_load_balancer + # sleep 5 + delete_instances + delete_placement_groups + delete_network + delete_firewall + delete_ssh_key + end + + private def delete_load_balancer + Hetzner::LoadBalancer::Delete.new( + hetzner_client: hetzner_client, + cluster_name: settings.cluster_name + ).run + end + + private def delete_instances + initialize_masters + initialize_worker_nodes + detect_nodes_with_kubectl + + channel = Channel(String).new + + instance_deletors.each do |instance_deletor| + spawn do + instance_deletor.run + channel.send(instance_deletor.instance_name) + end + end + + instance_deletors.size.times do + channel.receive + end + end + + private def delete_network + Hetzner::Network::Delete.new( + hetzner_client: hetzner_client, + network_name: settings.cluster_name + ).run + end + + private def delete_firewall + Hetzner::Firewall::Delete.new( + hetzner_client: hetzner_client, + firewall_name: settings.cluster_name + ).run + end + + private def delete_ssh_key +
Hetzner::SSHKey::Delete.new( + hetzner_client: hetzner_client, + ssh_key_name: settings.cluster_name, + public_ssh_key_path: settings.networking.ssh.public_key_path + ).run + end + + private def initialize_masters + settings.masters_pool.instance_count.times do |i| + instance_name = if settings.include_instance_type_in_instance_name + "#{settings.cluster_name}-#{settings.masters_pool.instance_type}-master#{i + 1}" + else + "#{settings.cluster_name}-master#{i + 1}" + end + + instance_deletors << Hetzner::Instance::Delete.new( + settings: settings, + hetzner_client: hetzner_client, + instance_name: instance_name + ) + end + end + + private def initialize_worker_nodes + no_autoscaling_worker_node_pools = settings.worker_node_pools.reject(&.autoscaling_enabled) + + no_autoscaling_worker_node_pools.each do |node_pool| + node_pool.instance_count.times do |i| + instance_name = if settings.include_instance_type_in_instance_name + "#{settings.cluster_name}-#{node_pool.instance_type}-pool-#{node_pool.name}-worker#{i + 1}" + else + "#{settings.cluster_name}-pool-#{node_pool.name}-worker#{i + 1}" + end + + instance_deletors << Hetzner::Instance::Delete.new( + settings: settings, + hetzner_client: hetzner_client, + instance_name: instance_name + ) + end + end + end + + private def delete_placement_groups + Hetzner::PlacementGroup::All.new(hetzner_client).delete_all + end + + private def default_log_prefix + "Cluster cleanup" + end + + private def detect_nodes_with_kubectl + result = run_shell_command("kubectl get nodes -o=custom-columns=NAME:.metadata.name | tail -n +2", configuration.kubeconfig_path, settings.hetzner_token, abort_on_error: false, print_output: false) + all_node_names = result.output.split("\n") + + all_node_names.each do |node_name| + unless instance_deletors.find { |deletor| deletor.instance_name == node_name } + instance_deletors << Hetzner::Instance::Delete.new( + settings: settings, + hetzner_client: hetzner_client, + instance_name: node_name + ) + end + end + end +end diff --git a/src/cluster/upgrade.cr b/src/cluster/upgrade.cr new file mode 100644 index 00000000..2b39415f --- /dev/null +++ b/src/cluster/upgrade.cr @@ -0,0 +1,92 @@ +require "crinja" +require "../util" +require "../util/shell" +require "../kubernetes/util" +require "../configuration/main" +require "../configuration/loader" + +class Cluster::Upgrade + include Util + include Util::Shell + include Kubernetes::Util + + UPGRADE_PLAN_MANIFEST_FOR_MASTERS = {{ read_file("#{__DIR__}/../../templates/upgrade_plan_for_masters.yaml") }} + UPGRADE_PLAN_MANIFEST_FOR_WORKERS = {{ read_file("#{__DIR__}/../../templates/upgrade_plan_for_workers.yaml") }} + + getter configuration : Configuration::Loader + getter settings : Configuration::Main do + configuration.settings + end + getter new_k3s_version : String? do + configuration.new_k3s_version + end + + def initialize(@configuration) + end + + def run + log_line "k3s version upgrade started" + + ensure_kubectl_is_installed! + + create_upgrade_plan_for_controlplane + create_upgrade_plan_for_workers + + log_line "Upgrade will now start. Run `watch kubectl get nodes` to see the nodes being upgraded. This should take a few minutes for a small cluster." + log_line "The API server may be briefly unavailable during the upgrade of the controlplane." 
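+ # Persist the target version to the configuration file so that a subsequent upgrade compares against it as the current version.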
+ + update_k3s_version_in_configuration_file + end + + private def default_log_prefix + "K3s upgrade" + end + + private def masters_upgrade_manifest + Crinja.render(UPGRADE_PLAN_MANIFEST_FOR_MASTERS, { + new_k3s_version: new_k3s_version, + }) + end + + private def worker_upgrade_concurrency + [(workers_count / 4).to_i, 1].max + end + + private def workers_count + settings.worker_node_pools.sum { |pool| pool.instance_count } + end + + private def create_upgrade_plan_for_controlplane + command = String.build do |str| + str << "kubectl apply -f - <<-EOF\n" + str << masters_upgrade_manifest.strip + str << "\nEOF" + end + + run_shell_command command, configuration.kubeconfig_path, settings.hetzner_token, error_message: "Failed to create upgrade plan for controlplane" + end + + private def create_upgrade_plan_for_workers + return if workers_count.zero? + + workers_upgrade_manifest = Crinja.render(UPGRADE_PLAN_MANIFEST_FOR_WORKERS, { + new_k3s_version: new_k3s_version, + worker_upgrade_concurrency: worker_upgrade_concurrency, + }) + + command = String.build do |str| + str << "kubectl apply -f - <<-EOF\n" + str << workers_upgrade_manifest.strip + str << "\nEOF" + end + + run_shell_command command, configuration.kubeconfig_path, settings.hetzner_token, error_message: "Failed to create upgrade plan for workers" + end + + private def update_k3s_version_in_configuration_file + current_configuration = File.read(configuration.configuration_file_path) + new_configuration = current_configuration.gsub(/k3s_version: .*/, "k3s_version: #{new_k3s_version}") + + File.write(configuration.configuration_file_path, new_configuration) + end +end diff --git a/src/configuration/autoscaling.cr b/src/configuration/autoscaling.cr new file mode 100644 index 00000000..cb4b0b26 --- /dev/null +++ b/src/configuration/autoscaling.cr @@ -0,0 +1,9 @@ +require "yaml" + +class Configuration::Autoscaling + include YAML::Serializable + + property enabled : Bool = false + property min_instances : Int32 = 0 + property max_instances : Int32 = 0 +end diff --git a/src/configuration/datastore.cr b/src/configuration/datastore.cr new file mode 100644 index 00000000..4d28fe20 --- /dev/null +++ b/src/configuration/datastore.cr @@ -0,0 +1,11 @@ +require "yaml" + +class Configuration::Datastore + include YAML::Serializable + + getter mode : String = "etcd" + getter external_datastore_endpoint : String = "" + + def initialize(@mode : String = "etcd", @external_datastore_endpoint : String = "") + end +end diff --git a/src/configuration/embedded_registry_mirror.cr b/src/configuration/embedded_registry_mirror.cr new file mode 100644 index 00000000..69bd5218 --- /dev/null +++ b/src/configuration/embedded_registry_mirror.cr @@ -0,0 +1,9 @@ +class Configuration::EmbeddedRegistryMirror + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter enabled : Bool = true + + def initialize + end +end diff --git a/src/configuration/loader.cr b/src/configuration/loader.cr new file mode 100644 index 00000000..5521ad59 --- /dev/null +++ b/src/configuration/loader.cr @@ -0,0 +1,198 @@ +require "yaml" +require "crest" + +require "./main" + +require "../hetzner/client" +require "../hetzner/instance_type" +require "../hetzner/location" + +require "./settings/configuration_file_path" +require "./settings/cluster_name" +require "./settings/kubeconfig_path" +require "./settings/k3s_version" +require "./settings/new_k3s_version" +require "./networking" +require "./settings/node_pool" +require "./settings/node_pool/autoscaling" +require 
"./settings/node_pool/pool_name" +require "./settings/node_pool/instance_type" +require "./settings/node_pool/location" +require "./settings/node_pool/instance_count" +require "./settings/node_pool/node_labels" +require "./settings/node_pool/node_taints" +require "./settings/datastore" +require "../util" + + +class Configuration::Loader + include Util + + getter hetzner_client : Hetzner::Client? + getter errors : Array(String) = [] of String + getter settings : Configuration::Main + + getter hetzner_client : Hetzner::Client do + if settings.hetzner_token.blank? + errors << "Hetzner API token is missing, please set it in the configuration file or in the environment variable HCLOUD_TOKEN" + print_errors + exit 1 + end + + Hetzner::Client.new(settings.hetzner_token) + end + + getter kubeconfig_path do + Path[settings.kubeconfig_path].expand(home: true).to_s + end + + getter masters_location : String | Nil do + settings.masters_pool.try &.location + end + + getter instance_types : Array(Hetzner::InstanceType) do + hetzner_client.instance_types + end + + getter locations : Array(Hetzner::Location) do + hetzner_client.locations + end + + getter new_k3s_version : String? + getter configuration_file_path : String + + private property instance_types_loaded : Bool = false + private property locations_loaded : Bool = false + + def initialize(@configuration_file_path, @new_k3s_version) + @settings = Configuration::Main.from_yaml(File.read(configuration_file_path)) + + Settings::ConfigurationFilePath.new(errors, configuration_file_path).validate + + print_errors unless errors.empty? + end + + def validate(command) + log_line "Validating configuration..." + + Settings::ClusterName.new(errors, settings.cluster_name).validate + + validate_command_specific_settings(command) + + print_validation_result + end + + private def validate_command_specific_settings(command) + case command + when :create + validate_create_settings + when :delete + when :upgrade + validate_upgrade_settings + end + end + + private def validate_create_settings + Settings::KubeconfigPath.new(errors, kubeconfig_path, file_must_exist: false).validate + Settings::K3sVersion.new(errors, settings.k3s_version).validate + Settings::Datastore.new(errors, settings.datastore).validate + + settings.networking.validate(errors, hetzner_client, settings.networking.private_network) + + validate_masters_pool + validate_worker_node_pools + + validate_kubectl_presence + validate_helm_presence + end + + private def validate_upgrade_settings + Settings::KubeconfigPath.new(errors, kubeconfig_path, file_must_exist: true).validate + Settings::NewK3sVersion.new(errors, settings.k3s_version, new_k3s_version).validate + + validate_kubectl_presence + end + + private def validate_kubectl_presence + errors << "kubectl is not installed or not in PATH" unless which("kubectl") + end + + private def validate_helm_presence + errors << "helm is not installed or not in PATH" unless which("helm") + end + + private def print_validation_result + if errors.empty? + log_line "...configuration seems valid." + else + print_errors + exit 1 + end + end + + private def validate_masters_pool + Settings::NodePool.new( + errors: errors, + pool: settings.masters_pool, + pool_type: :masters, + masters_location: masters_location, + instance_types: instance_types, + locations: locations, + datastore: settings.datastore + ).validate + end + + private def validate_worker_node_pools + if settings.worker_node_pools.nil? 
+ errors << "`worker_node_pools` is required if workloads cannot be scheduled on masters" unless settings.schedule_workloads_on_masters + return + end + + node_pools = settings.worker_node_pools + validate_node_pools_configuration(node_pools) + end + + private def validate_node_pools_configuration(node_pools) + if node_pools.empty? + errors << "At least one worker node pool is required in order to schedule workloads" unless settings.schedule_workloads_on_masters + else + validate_unique_node_pool_names(node_pools) + validate_each_node_pool(node_pools) + end + end + + private def validate_unique_node_pool_names(node_pools) + worker_node_pool_names = node_pools.map(&.name) + errors << "Each worker node pool must have a unique name" if worker_node_pool_names.uniq.size != node_pools.size + end + + private def validate_each_node_pool(node_pools) + node_pools.each do |worker_node_pool| + Settings::NodePool.new( + errors: errors, + pool: worker_node_pool, + pool_type: :workers, + masters_location: masters_location, + instance_types: instance_types, + locations: locations, + datastore: settings.datastore + ).validate + end + end + + private def print_errors + return if errors.empty? + + log_line "Some information in the configuration file requires your attention:" + + errors.each do |error| + STDERR.puts "[#{default_log_prefix}] - #{error}" + end + + exit 1 + end + + private def default_log_prefix + "Configuration" + end +end diff --git a/src/configuration/main.cr b/src/configuration/main.cr new file mode 100644 index 00000000..358e9e66 --- /dev/null +++ b/src/configuration/main.cr @@ -0,0 +1,39 @@ +require "yaml" + +require "./node_pool" +require "./datastore" +require "./manifests" +require "./embedded_registry_mirror" + +class Configuration::Main + include YAML::Serializable + + getter hetzner_token : String = ENV.fetch("HCLOUD_TOKEN", "") + getter cluster_name : String + getter kubeconfig_path : String + getter k3s_version : String + getter api_server_hostname : String? + getter schedule_workloads_on_masters : Bool = false + getter masters_pool : Configuration::NodePool + getter worker_node_pools : Array(Configuration::NodePool) = [] of Configuration::NodePool + getter post_create_commands : Array(String) = [] of String + getter additional_packages : Array(String) = [] of String + getter kube_api_server_args : Array(String) = [] of String + getter kube_scheduler_args : Array(String) = [] of String + getter kube_controller_manager_args : Array(String) = [] of String + getter kube_cloud_controller_manager_args : Array(String) = [] of String + getter kubelet_args : Array(String) = [] of String + getter kube_proxy_args : Array(String) = [] of String + getter image : String = "ubuntu-24.04" + getter autoscaling_image : String? 
+ getter snapshot_os : String = "default" + getter networking : Configuration::Networking = Configuration::Networking.new + getter datastore : Configuration::Datastore = Configuration::Datastore.new + getter manifests : Configuration::Manifests = Configuration::Manifests.new + getter embedded_registry_mirror : Configuration::EmbeddedRegistryMirror = Configuration::EmbeddedRegistryMirror.new + getter include_instance_type_in_instance_name : Bool = false + + def all_kubelet_args + ["cloud-provider=external", "resolv-conf=/etc/k8s-resolv.conf"] + kubelet_args + end +end diff --git a/src/configuration/manifests.cr b/src/configuration/manifests.cr new file mode 100644 index 00000000..d5402dae --- /dev/null +++ b/src/configuration/manifests.cr @@ -0,0 +1,15 @@ +module Configuration + class Manifests + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter cloud_controller_manager_manifest_url : String = "https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/v1.20.0/ccm-networks.yaml" + getter csi_driver_manifest_url : String = "https://raw.githubusercontent.com/hetznercloud/csi-driver/v2.9.0/deploy/kubernetes/hcloud-csi.yml" + getter system_upgrade_controller_deployment_manifest_url : String = "https://github.com/rancher/system-upgrade-controller/releases/download/v0.13.4/system-upgrade-controller.yaml" + getter system_upgrade_controller_crd_manifest_url : String = "https://github.com/rancher/system-upgrade-controller/releases/download/v0.13.4/crd.yaml" + getter cluster_autoscaler_manifest_url : String = "https://raw.githubusercontent.com/kubernetes/autoscaler/master/cluster-autoscaler/cloudprovider/hetzner/examples/cluster-autoscaler-run-on-master.yaml" + + def initialize + end + end +end diff --git a/src/configuration/networking.cr b/src/configuration/networking.cr new file mode 100644 index 00000000..643e0d92 --- /dev/null +++ b/src/configuration/networking.cr @@ -0,0 +1,33 @@ +require "./networking_components/cni" +require "./networking_components/allowed_networks" +require "./networking_components/private_network" +require "./networking_components/public_network" +require "./networking_components/ssh" +require "../hetzner/client" +require "../hetzner/network/find" + +module Configuration + class Networking + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter cni : ::Configuration::NetworkingComponents::CNI = ::Configuration::NetworkingComponents::CNI.new + getter private_network : ::Configuration::NetworkingComponents::PrivateNetwork = ::Configuration::NetworkingComponents::PrivateNetwork.new + getter public_network : ::Configuration::NetworkingComponents::PublicNetwork = ::Configuration::NetworkingComponents::PublicNetwork.new + getter allowed_networks : ::Configuration::NetworkingComponents::AllowedNetworks = ::Configuration::NetworkingComponents::AllowedNetworks.new + getter ssh : ::Configuration::NetworkingComponents::SSH = ::Configuration::NetworkingComponents::SSH.new + getter cluster_cidr : String = "10.244.0.0/16" + getter service_cidr : String = "10.43.0.0/16" + getter cluster_dns : String = "10.43.0.10" + + def initialize + end + + def validate(errors, hetzner_client, private_network) + cni.validate(errors, private_network) + allowed_networks.validate(errors) + private_network.validate(errors, hetzner_client) + end + end +end + diff --git a/src/configuration/networking_components/allowed_networks.cr b/src/configuration/networking_components/allowed_networks.cr new file mode 100644 index 00000000..76682538 
--- /dev/null +++ b/src/configuration/networking_components/allowed_networks.cr @@ -0,0 +1,77 @@ +class Configuration::NetworkingComponents::AllowedNetworks + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter ssh : Array(String) = ["0.0.0.0/0"] + getter api : Array(String) = ["0.0.0.0/0"] + + def initialize + end + + def validate(errors) + validate_networks(errors, ssh, "SSH") + validate_networks(errors, api, "API") + end + + private def validate_current_ip_must_be_included_in_at_least_one_network(errors, networks, network_type) + current_ip = IPAddress.new("127.0.0.1") + + begin + current_ip = IPAddress.new(Crest.get("https://ipinfo.io/ip").body) + rescue ex : Crest::RequestFailed + errors << "Unable to determine your current IP (necessary to validate allowed networks for SSH and API)" + return + end + + included = false + + networks.each do |cidr| + included = check_current_ip_in_network(errors, cidr, current_ip, included, network_type) + end + + unless included + errors << "Your current IP #{current_ip} must belong to at least one of the #{network_type} allowed networks" + end + end + + private def check_current_ip_in_network(errors, cidr : String, current_ip : IPAddress, included : Bool, network_type) : Bool + begin + network = IPAddress.new(cidr).network + + if network.includes? current_ip + included = true + end + rescue ex : ArgumentError + if ex.message =~ /Invalid netmask/ + errors << "#{network_type} allowed network #{cidr} has an invalid netmask" + else + errors << "#{network_type} allowed network #{cidr} is not a valid network in CIDR notation" + end + end + included + end + + private def validate_cidr_network(errors, cidr : String, network_type) + begin + IPAddress.new(cidr).network? + rescue ArgumentError + errors << "#{network_type} allowed network #{cidr} is not a valid network in CIDR notation" + end + end + + private def validate_networks(errors, networks, network_type) + if networks + if networks.empty?
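+ # An explicitly empty list would lock all access out, so it is rejected just like a missing one.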
+ errors << "#{network_type} allowed networks are required" + else + networks.each do |cidr| + validate_cidr_network(errors, cidr, network_type) + end + + validate_current_ip_must_be_included_in_at_least_one_network(errors, networks, network_type) + end + else + errors << "#{network_type} allowed networks are required" + end + end +end diff --git a/src/configuration/networking_components/ciium.cr b/src/configuration/networking_components/ciium.cr new file mode 100644 index 00000000..7090b5a8 --- /dev/null +++ b/src/configuration/networking_components/ciium.cr @@ -0,0 +1,9 @@ +class Configuration::NetworkingComponents::Cilium + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter chart_version : String = "v1.15.7" + + def initialize + end +end diff --git a/src/configuration/networking_components/cni.cr b/src/configuration/networking_components/cni.cr new file mode 100644 index 00000000..46021909 --- /dev/null +++ b/src/configuration/networking_components/cni.cr @@ -0,0 +1,48 @@ +require "./ciium" +require "./flannel" + +class Configuration::NetworkingComponents::CNI + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter enabled : Bool = true + getter mode : String = "flannel" + getter encryption : Bool = true + getter cilium : Configuration::NetworkingComponents::Cilium = Configuration::NetworkingComponents::Cilium.new + getter flannel : Configuration::NetworkingComponents::Flannel = Configuration::NetworkingComponents::Flannel.new + + def initialize + end + + def validate(errors, private_network) + return unless enabled + + if !encryption && !private_network.enabled + errors << "CNI encryption must be enabled when private networking is enabled" + end + + unless ["flannel", "cilium"].includes?(mode) + errors << "CNI mode must be either 'flannel' or 'cilium' when CNI is enabled" + end + end + + def flannel? + enabled? && mode == "flannel" + end + + def cilium? + enabled && mode == "cilium" + end + + def encryption? + encryption + end + + def enabled? + enabled + end + + def kube_proxy? + cilium? ? false : !flannel.disable_kube_proxy? + end +end diff --git a/src/configuration/networking_components/flannel.cr b/src/configuration/networking_components/flannel.cr new file mode 100644 index 00000000..7fd0bbfa --- /dev/null +++ b/src/configuration/networking_components/flannel.cr @@ -0,0 +1,13 @@ +class Configuration::NetworkingComponents::Flannel + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter disable_kube_proxy : Bool = false + + def initialize + end + + def disable_kube_proxy? : Bool + disable_kube_proxy + end +end diff --git a/src/configuration/networking_components/private_network.cr b/src/configuration/networking_components/private_network.cr new file mode 100644 index 00000000..7e8ca11d --- /dev/null +++ b/src/configuration/networking_components/private_network.cr @@ -0,0 +1,31 @@ +require "ipaddress" + +class Configuration::NetworkingComponents::PrivateNetwork + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter enabled : Bool = true + getter subnet : String = "10.0.0.0/16" + getter existing_network_name : String = "" + + def initialize + end + + def validate(errors, hetzner_client) + validate_existing_network_name(errors, hetzner_client) + + begin + IPAddress.new(subnet).network? 
+ rescue ArgumentError + errors << "private network subnet #{subnet} is not a valid network in CIDR notation" + end + end + + private def validate_existing_network_name(errors, hetzner_client) + return if existing_network_name.empty? + + return if Hetzner::Network::Find.new(hetzner_client, existing_network_name).run + + errors << "You have specified that you want to use the existing network named '#{existing_network_name}' but this network doesn't exist" + end +end diff --git a/src/configuration/networking_components/public_network.cr b/src/configuration/networking_components/public_network.cr new file mode 100644 index 00000000..2fd237d8 --- /dev/null +++ b/src/configuration/networking_components/public_network.cr @@ -0,0 +1,10 @@ +class Configuration::NetworkingComponents::PublicNetwork + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter ipv4 : Bool = true + getter ipv6 : Bool = true + + def initialize + end +end diff --git a/src/configuration/networking_components/ssh.cr b/src/configuration/networking_components/ssh.cr new file mode 100644 index 00000000..ade243e8 --- /dev/null +++ b/src/configuration/networking_components/ssh.cr @@ -0,0 +1,39 @@ +class Configuration::NetworkingComponents::SSH + include YAML::Serializable + include YAML::Serializable::Unmapped + + getter port : Int32 = 22 + getter use_agent : Bool = false + getter private_key_path : String = "~/.ssh/id_rsa" + getter public_key_path : String = "~/.ssh/id_rsa.pub" + + def initialize + end + + def validate(errors) + validate_path(errors, private_key_path, "private") + validate_path(errors, public_key_path, "public") + end + + def private_key_path + absolute_path(@private_key_path) + end + + def public_key_path + absolute_path(@public_key_path) + end + + private def validate_path(errors, path, key_type) + if ! File.exists?(path) + errors << "#{key_type}_key_path does not exist" + elsif File.directory?(path) + errors << "#{key_type}_key_path is a directory, while we expect a key file" + end + end + + private def absolute_path(path) + home_dir = ENV["HOME"]? || raise "HOME environment variable not set" + relative_path = path.sub("~/", "#{home_dir}/") + File.expand_path(relative_path) + end +end diff --git a/src/configuration/node_label.cr b/src/configuration/node_label.cr new file mode 100644 index 00000000..a5278ebc --- /dev/null +++ b/src/configuration/node_label.cr @@ -0,0 +1,8 @@ +require "yaml" + +class Configuration::NodeLabel + include YAML::Serializable + + property key : String? + property value : String? +end diff --git a/src/configuration/node_pool.cr b/src/configuration/node_pool.cr new file mode 100644 index 00000000..fdb56854 --- /dev/null +++ b/src/configuration/node_pool.cr @@ -0,0 +1,24 @@ +require "yaml" + +require "./node_label" +require "./node_taint" +require "./autoscaling" + +class Configuration::NodePool + include YAML::Serializable + + property name : String? + property instance_type : String + property location : String + property image : String | Int64 | Nil + property instance_count : Int32 = 1 + property labels : Array(::Configuration::NodeLabel) = [] of ::Configuration::NodeLabel + property taints : Array(::Configuration::NodeTaint) = [] of ::Configuration::NodeTaint + property autoscaling : ::Configuration::Autoscaling?
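+ # Per-pool overrides: when these are nil, the global post_create_commands and additional_packages from the main configuration apply.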
+ property post_create_commands : Array(String) | Nil + property additional_packages : Array(String) | Nil + + getter autoscaling_enabled : Bool do + autoscaling.try(&.enabled) || false + end +end diff --git a/src/configuration/node_taint.cr b/src/configuration/node_taint.cr new file mode 100644 index 00000000..e54ae694 --- /dev/null +++ b/src/configuration/node_taint.cr @@ -0,0 +1,8 @@ +require "yaml" + +class Configuration::NodeTaint + include YAML::Serializable + + property key : String? + property value : String? +end diff --git a/src/configuration/settings/cluster_name.cr b/src/configuration/settings/cluster_name.cr new file mode 100644 index 00000000..c9503eb7 --- /dev/null +++ b/src/configuration/settings/cluster_name.cr @@ -0,0 +1,17 @@ +class Configuration::Settings::ClusterName + getter cluster_name : String + getter errors : Array(String) + + def initialize(@errors, @cluster_name) + end + + def validate + if cluster_name.empty? + errors << "cluster_name is required" + elsif ! /\A[a-z\d-]+\z/.match(cluster_name) + errors << "cluster_name has an invalid format (only lowercase letters, digits and dashes are allowed)" + elsif ! /\A[a-z]+.*([a-z]|\d)+\z/.match(cluster_name) + errors << "Ensure that cluster_name starts with a letter and ends with a letter or digit" + end + end +end diff --git a/src/configuration/settings/configuration_file_path.cr b/src/configuration/settings/configuration_file_path.cr new file mode 100644 index 00000000..cea5292f --- /dev/null +++ b/src/configuration/settings/configuration_file_path.cr @@ -0,0 +1,19 @@ +class Configuration::Settings::ConfigurationFilePath + getter path : String + getter errors : Array(String) + + def initialize(@errors, @path) + end + + def validate + configuration_file_path = Path[@path].expand(home: true).to_s + + if File.exists?(configuration_file_path) + if File.directory?(configuration_file_path) + errors << "Configuration path points to a directory, not a file" + end + else + errors << "Configuration file not found at #{configuration_file_path}" + end + end +end diff --git a/src/configuration/settings/datastore.cr b/src/configuration/settings/datastore.cr new file mode 100644 index 00000000..e199fba7 --- /dev/null +++ b/src/configuration/settings/datastore.cr @@ -0,0 +1,17 @@ +class Configuration::Settings::Datastore + getter errors : Array(String) + getter datastore : Configuration::Datastore + + def initialize(@errors, @datastore) + end + + def validate + case datastore.mode + when "etcd" + when "external" + errors << "external_datastore_endpoint is required for external datastore" if datastore.external_datastore_endpoint.strip.empty?
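+ # The default "etcd" mode needs no additional settings; any mode other than "etcd" or "external" is rejected below.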
+ else + errors << "datastore mode is invalid - allowed values are 'etcd' and 'external'" + end + end +end diff --git a/src/configuration/settings/k3s_version.cr b/src/configuration/settings/k3s_version.cr new file mode 100644 index 00000000..c71f980d --- /dev/null +++ b/src/configuration/settings/k3s_version.cr @@ -0,0 +1,13 @@ +class Configuration::Settings::K3sVersion + getter errors : Array(String) + getter k3s_version : String + + def initialize(@errors, @k3s_version) + end + + def validate + return if K3s.available_releases.includes?(@k3s_version) + + errors << "K3s version is not valid, run `hetzner-k3s releases` to see available versions" + end +end diff --git a/src/configuration/settings/kubeconfig_path.cr b/src/configuration/settings/kubeconfig_path.cr new file mode 100644 index 00000000..bd1ad810 --- /dev/null +++ b/src/configuration/settings/kubeconfig_path.cr @@ -0,0 +1,20 @@ +class Configuration::Settings::KubeconfigPath + getter errors : Array(String) + getter kubeconfig_path : String + getter file_must_exist : Bool + + def initialize(@errors, @kubeconfig_path, @file_must_exist) + end + + def validate + if @kubeconfig_path + if File.exists?(@kubeconfig_path) && File.directory?(@kubeconfig_path) + errors << "kubeconfig_path already exists and it's a directory. We would need to write a kubeconfig file at that path" + elsif @file_must_exist && !File.exists?(@kubeconfig_path) + errors << "kubeconfig_path does not exist" + end + else + errors << "kubeconfig_path is required" + end + end +end diff --git a/src/configuration/settings/new_k3s_version.cr b/src/configuration/settings/new_k3s_version.cr new file mode 100644 index 00000000..9ef30d00 --- /dev/null +++ b/src/configuration/settings/new_k3s_version.cr @@ -0,0 +1,32 @@ +require "yaml" + +class Configuration::Settings::NewK3sVersion + getter errors : Array(String) + getter current_k3s_version : String + getter new_k3s_version : String? + getter releases : Array(String) | Array(YAML::Any) { ::K3s.available_releases } + getter new_version : String { new_k3s_version.not_nil! } + + def initialize(@errors, @current_k3s_version, @new_k3s_version) + end + + def validate + validate_release_number + validate_new_version_must_be_more_recent + end + + private def validate_release_number + return if releases.includes?(new_version) + + errors << "New k3s version is not valid, run `hetzner-k3s releases` to see available versions" + end + + private def validate_new_version_must_be_more_recent + current_version_index = releases.index(current_k3s_version) || -1 + new_version_index = releases.index(new_version) || -1 + + return if new_version_index > current_version_index + + errors << "New k3s version must be more recent than current version" + end +end diff --git a/src/configuration/settings/node_pool.cr b/src/configuration/settings/node_pool.cr new file mode 100644 index 00000000..5e20be78 --- /dev/null +++ b/src/configuration/settings/node_pool.cr @@ -0,0 +1,41 @@ +require "../../hetzner/location" +require "../../hetzner/instance_type" +require "../node_pool" +require "../datastore" + +class Configuration::Settings::NodePool + getter errors : Array(String) = [] of String + getter pool : Configuration::NodePool + getter pool_type : Symbol = :workers + getter masters_location : String? + getter instance_types : Array(Hetzner::InstanceType) = [] of Hetzner::InstanceType + getter locations : Array(Hetzner::Location) = [] of Hetzner::Location + + getter pool_name : String { masters? ? 
"masters" : pool.try(&.name) || "" } + getter pool_description : String { workers? ? "Worker mode pool '#{pool_name}'" : "Masters pool" } + + getter datastore : Configuration::Datastore + + def initialize(@errors, @pool, @pool_type, @masters_location, @instance_types, @locations, @datastore) + end + + def validate + return unless pool + + PoolName.new(errors, pool_type, pool_name).validate + InstanceType.new(errors, pool, instance_types).validate + Location.new(errors, pool, pool_type, masters_location, locations).validate + InstanceCount.new(errors, pool, pool_type, datastore).validate unless pool.autoscaling_enabled + NodeLabels.new(errors, pool_type, pool.try(&.labels)).validate + NodeTaints.new(errors, pool_type, pool.try(&.taints)).validate + Autoscaling.new(errors, pool).validate if pool_type == :workers + end + + private def workers? + pool_type == :workers + end + + private def masters? + pool_type == :masters + end +end diff --git a/src/configuration/settings/node_pool/autoscaling.cr b/src/configuration/settings/node_pool/autoscaling.cr new file mode 100644 index 00000000..ed73c66b --- /dev/null +++ b/src/configuration/settings/node_pool/autoscaling.cr @@ -0,0 +1,17 @@ +require "../../node_pool" + +class Configuration::Settings::NodePool::Autoscaling + getter errors : Array(String) + getter pool : Configuration::NodePool + + def initialize(@errors, @pool) + end + + def validate + autoscaling_settings = pool.try(&.autoscaling) + + if autoscaling_settings && autoscaling_settings.enabled + errors << "Autoscaling settings for pool #{pool.name} are invalid: max_instances must be greater than min_instances" if autoscaling_settings.max_instances <= autoscaling_settings.min_instances + end + end +end diff --git a/src/configuration/settings/node_pool/instance_count.cr b/src/configuration/settings/node_pool/instance_count.cr new file mode 100644 index 00000000..256482e5 --- /dev/null +++ b/src/configuration/settings/node_pool/instance_count.cr @@ -0,0 +1,24 @@ +require "../../node_pool" +require "../../datastore" + +class Configuration::Settings::NodePool::InstanceCount + getter errors : Array(String) + getter pool : Configuration::NodePool + getter pool_type : Symbol + getter datastore : Configuration::Datastore + + def initialize(@errors, @pool, @pool_type, @datastore) + end + + def validate + validate_master_count if pool_type == :masters + end + + private def validate_master_count + if pool.instance_count > 0 && (pool.instance_count.odd? || datastore.mode == "external") + return + else + errors << "Masters count must equal to 1 for non-HA clusters or an odd number (recommended 3) for an HA cluster" + end + end +end diff --git a/src/configuration/settings/node_pool/instance_type.cr b/src/configuration/settings/node_pool/instance_type.cr new file mode 100644 index 00000000..6fd83ff3 --- /dev/null +++ b/src/configuration/settings/node_pool/instance_type.cr @@ -0,0 +1,21 @@ +require "../../node_pool" +require "../../../hetzner/instance_type" + +class Configuration::Settings::NodePool::InstanceType + getter errors : Array(String) + getter pool : Configuration::NodePool + getter instances_types : Array(Hetzner::InstanceType) + + def initialize(@errors, @pool, @instances_types) + end + + def validate + return if valid_instance_type? + + errors << "#{pool.name || "masters"} node pool has an invalid instance type" + end + + private def valid_instance_type? + instances_types.any? 
{ |instance_type| instance_type.name == pool.instance_type }
+  end
+end
diff --git a/src/configuration/settings/node_pool/location.cr b/src/configuration/settings/node_pool/location.cr
new file mode 100644
index 00000000..618d008b
--- /dev/null
+++ b/src/configuration/settings/node_pool/location.cr
@@ -0,0 +1,41 @@
+require "../../node_pool"
+require "../../../hetzner/location"
+
+class Configuration::Settings::NodePool::Location
+  getter errors : Array(String)
+  getter pool : Configuration::NodePool
+  getter pool_type : Symbol
+  getter masters_location : String?
+  getter locations : Array(Hetzner::Location)
+
+  def initialize(@errors, @pool, @pool_type, @masters_location, @locations)
+  end
+
+  def validate
+    location = pool.location
+
+    if valid_location?(location)
+      validate_network_zone(location) if pool_type == :workers && masters_location
+    else
+      errors << "#{pool_type} pool has an invalid location"
+    end
+  end
+
+  private def valid_location?(location)
+    locations.any? { |loc| loc.name == location }
+  end
+
+  private def validate_network_zone(location)
+    in_network_zone = if masters_location == "ash"
+      location == "ash"
+    elsif masters_location == "hil"
+      location == "hil"
+    else
+      !%w(ash hil).includes?(location)
+    end
+
+    unless in_network_zone
+      errors << "#{pool_type} pool must be in the same network zone as the masters when using a private network. If the masters are in Ashburn, all node pools must be in Ashburn too; the same applies to Hillsboro. If the masters are in Germany or Finland, the worker node pools must also be in Germany or Finland, since those locations belong to the same network zone."
+    end
+  end
+end
diff --git a/src/configuration/settings/node_pool/node_labels.cr b/src/configuration/settings/node_pool/node_labels.cr
new file mode 100644
index 00000000..d3fbcb28
--- /dev/null
+++ b/src/configuration/settings/node_pool/node_labels.cr
@@ -0,0 +1,21 @@
+require "../../node_label"
+
+class Configuration::Settings::NodePool::NodeLabels
+  getter errors : Array(String)
+  getter pool_type : Symbol
+  getter labels : Array(Configuration::NodeLabel)?
+
+  def initialize(@errors, @pool_type, @labels)
+  end
+
+  def validate
+    return unless labels
+
+    labels.try &.each do |label|
+      if label.key.nil? || label.value.nil?
+        errors << "#{pool_type} has invalid labels"
+        break
+      end
+    end
+  end
+end
diff --git a/src/configuration/settings/node_pool/node_taints.cr b/src/configuration/settings/node_pool/node_taints.cr
new file mode 100644
index 00000000..67e4d256
--- /dev/null
+++ b/src/configuration/settings/node_pool/node_taints.cr
@@ -0,0 +1,21 @@
+require "../../node_taint"
+
+class Configuration::Settings::NodePool::NodeTaints
+  getter errors : Array(String)
+  getter pool_type : Symbol
+  getter taints : Array(Configuration::NodeTaint)?
+
+  def initialize(@errors, @pool_type, @taints)
+  end
+
+  def validate
+    return unless taints
+
+    taints.try &.each do |taint|
+      if taint.key.nil? || taint.value.nil?
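+        # a taint needs both a key and a value; one error per pool is enough, hence the break below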
+        errors << "#{pool_type} has invalid taints"
+        break
+      end
+    end
+  end
+end
diff --git a/src/configuration/settings/node_pool/pool_name.cr b/src/configuration/settings/node_pool/pool_name.cr
new file mode 100644
index 00000000..74151dbb
--- /dev/null
+++ b/src/configuration/settings/node_pool/pool_name.cr
@@ -0,0 +1,18 @@
+class Configuration::Settings::NodePool::PoolName
+  getter errors : Array(String)
+  getter pool_type : Symbol
+  getter pool_name : String
+
+  def initialize(@errors, @pool_type, @pool_name)
+  end
+
+  def validate
+    return if pool_type == :masters || valid_pool_name?(pool_name)
+
+    errors << "#{pool_type} has an invalid name"
+  end
+
+  private def valid_pool_name?(name : String) : Bool
+    (name =~ /\A([A-Za-z0-9\-_]+)\Z/) != nil
+  end
+end
diff --git a/src/hetzner-k3s.cr b/src/hetzner-k3s.cr
new file mode 100644
index 00000000..0fbe0fbf
--- /dev/null
+++ b/src/hetzner-k3s.cr
@@ -0,0 +1,96 @@
+require "admiral"
+
+require "./configuration/loader"
+require "./k3s"
+require "./cluster/create"
+require "./cluster/delete"
+require "./cluster/upgrade"
+
+module Hetzner::K3s
+  class CLI < Admiral::Command
+    VERSION = "2.0.8"
+
+    class Create < Admiral::Command
+      define_help description: "create - Create a cluster"
+
+      define_flag configuration_file_path : String,
+        description: "The path of the YAML configuration file",
+        long: "config",
+        short: "c",
+        required: true
+
+      def run
+        configuration = Configuration::Loader.new(flags.configuration_file_path, nil)
+        configuration.validate(:create)
+
+        Cluster::Create.new(configuration: configuration).run
+      end
+    end
+
+    class Delete < Admiral::Command
+      define_help description: "delete - Delete a cluster"
+
+      define_flag configuration_file_path : String,
+        description: "The path of the YAML configuration file",
+        long: "config",
+        short: "c",
+        required: true
+
+      def run
+        configuration = Configuration::Loader.new(flags.configuration_file_path, nil)
+        configuration.validate(:delete)
+
+        Cluster::Delete.new(configuration: configuration).run
+      end
+    end
+
+    class Upgrade < Admiral::Command
+      define_help description: "upgrade - Upgrade a cluster to a newer version of k3s"
+
+      define_flag configuration_file_path : String,
+        description: "The path of the YAML configuration file",
+        long: "config",
+        short: "c",
+        required: true
+
+      define_flag new_k3s_version : String,
+        description: "The new version of k3s to upgrade to",
+        long: "new-k3s-version",
+        required: true
+
+      def run
+        configuration = Configuration::Loader.new(flags.configuration_file_path, flags.new_k3s_version)
+        configuration.validate(:upgrade)
+
+        Cluster::Upgrade.new(configuration: configuration).run
+      end
+    end
+
+    class Releases < Admiral::Command
+      define_help description: "releases - List the available k3s releases"
+
+      def run
+        puts "Available k3s releases:"
+
+        ::K3s.available_releases.each do |release|
+          puts release
+        end
+      end
+    end
+
+    define_version VERSION
+
+    define_help description: "hetzner-k3s - A tool to create k3s clusters on Hetzner Cloud"
+
+    register_sub_command create : Create, description: "Create a cluster"
+    register_sub_command delete : Delete, description: "Delete a cluster"
+    register_sub_command upgrade : Upgrade, description: "Upgrade a cluster to a new version of k3s"
+    register_sub_command releases : Releases, description: "List the available k3s releases"
+
+    def run
+      puts help
+    end
+  end
+end
+
+Hetzner::K3s::CLI.run
diff --git a/src/hetzner/client.cr b/src/hetzner/client.cr
new file mode 100644
index 00000000..b13240e3
--- /dev/null
+++ b/src/hetzner/client.cr @@ -0,0 +1,144 @@ +require "crest" +require "yaml" +require "json" + +require "./location" +require "./locations_list" +require "./instance_type" +require "./instance_types_list" + +class Hetzner::Client + getter token : String? + + private getter api_url : String = "https://api.hetzner.cloud/v1" + private getter mutex : Mutex = Mutex.new + + def initialize(token) + @token = token + end + + def locations : Array(Location) + @locations ||= begin + success, response = get("/locations") + + if success + Hetzner::LocationsList.from_json(response).locations + else + puts "[Preflight checks] Unable to fetch locations via Hetzner API" + exit 1 + end + end + end + + def instance_types : Array(InstanceType) + @instance_types ||= begin + success, response = get("/server_types") + + if success + Hetzner::InstanceTypesList.from_json(response).server_types + else + puts "[Preflight checks] Unable to fetch instance types via Hetzner API" + exit 1 + end + end + end + + def get(path, params : Hash = {} of Symbol => String | Bool | Nil) + response = with_rate_limit do + Crest.get( + "#{api_url}#{path}", + params: params, + json: true, + headers: headers, + handle_errors: false + ) + end + + handle_response(response) + end + + def post(path, params = {} of KeyType => ValueType) + response = with_rate_limit do + Crest.post( + "#{api_url}#{path}", + params, + json: true, + headers: headers, + handle_errors: false + ) + end + + handle_response(response) + end + + def put(path, params = {} of KeyType => ValueType) + response = with_rate_limit do + Crest.put( + "#{api_url}#{path}", + params, + json: true, + headers: headers, + handle_errors: false + ) + end + + handle_response(response) + end + + def delete(path, id) + response = with_rate_limit do + Crest.delete( + "#{api_url}#{path}/#{id}", + json: true, + headers: headers, + handle_errors: false + ) + end + + handle_response(response) + end + + private def headers + @headers ||= { + "Authorization" => "Bearer #{token}", + } + end + + private def handle_rate_limit(response) + reset_timestamp = response.headers["ratelimit-reset"] + + return unless reset_timestamp.is_a?(String) + + reset_time = reset_timestamp.to_i + wait_time = reset_time - Time.utc.to_unix + 30 + + while wait_time > 0 + reset_time = Time.utc.to_unix + wait_time + remaining_time = Time::Span.new(seconds: wait_time) + puts "[Hetzner API] Rate Limit hit. Waiting for #{remaining_time.total_hours.floor}h#{remaining_time.minutes.floor}m#{remaining_time.seconds.floor}s until reset..." 
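+      # sleep in slices of at most 5 seconds so the remaining wait is
+      # re-evaluated as it counts down instead of blocking in one long sleep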
+ sleep_time = [wait_time, 5].min + sleep(sleep_time) + wait_time -= sleep_time + end + end + + private def with_rate_limit + while true + response = yield + + if response.status_code == 429 + mutex.synchronize do + handle_rate_limit(response) + end + else + return response + end + end + end + + private def handle_response(response) : Tuple(Bool, String) + success = response.status_code >= 200 && response.status_code < 300 + + {success, response.body.to_s} + end +end diff --git a/src/hetzner/firewall.cr b/src/hetzner/firewall.cr new file mode 100644 index 00000000..b9e32ca4 --- /dev/null +++ b/src/hetzner/firewall.cr @@ -0,0 +1,8 @@ +require "json" + +class Hetzner::Firewall + include JSON::Serializable + + property id : Int32 + property name : String +end diff --git a/src/hetzner/firewall/create.cr b/src/hetzner/firewall/create.cr new file mode 100644 index 00000000..ba831e5b --- /dev/null +++ b/src/hetzner/firewall/create.cr @@ -0,0 +1,216 @@ +require "../client" +require "./find" +require "../../util" +require "../../configuration/networking" + +class Hetzner::Firewall::Create + include Util + + private getter settings : Configuration::Main + private getter masters : Array(Hetzner::Instance) + private getter hetzner_client : Hetzner::Client + private getter firewall_name : String + private getter firewall_finder : Hetzner::Firewall::Find + private getter private_network : Configuration::NetworkingComponents::PrivateNetwork + private getter ssh : Configuration::NetworkingComponents::SSH + private getter allowed_networks : Configuration::NetworkingComponents::AllowedNetworks + + def initialize( + @settings, + @hetzner_client, + @firewall_name, + @masters + ) + @private_network = settings.networking.private_network + @ssh = settings.networking.ssh + @allowed_networks = settings.networking.allowed_networks + @firewall_finder = Hetzner::Firewall::Find.new(hetzner_client, firewall_name) + end + + def run + firewall = firewall_finder.run + action = firewall ? :update : :create + + if firewall + log_line "Updating firewall..." + action_path = "/firewalls/#{firewall.id}/actions/set_rules" + else + log_line "Creating firewall..." + action_path = "/firewalls" + end + + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.post(action_path, firewall_config) + + if success + log_line action == :update ? "...firewall updated" : "...firewall created" + else + STDERR.puts "[#{default_log_prefix}] Failed to create or update firewall: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to create or update firewall in 5 seconds..." + raise "Failed to create or update firewall" + end + end + + firewall = firewall_finder.run + firewall.not_nil! 
+  end
+
+  private def firewall_config
+    rules = [
+      {
+        :description => "Allow SSH port",
+        :direction => "in",
+        :protocol => "tcp",
+        :port => ssh.port.to_s,
+        :source_ips => allowed_networks.ssh,
+        :destination_ips => [] of String
+      },
+      {
+        :description => "Allow ICMP (ping)",
+        :direction => "in",
+        :protocol => "icmp",
+        :source_ips => [
+          "0.0.0.0/0",
+          "::/0"
+        ],
+        :destination_ips => [] of String
+      },
+      {
+        :description => "Node port range",
+        :direction => "in",
+        :protocol => "tcp",
+        :port => "30000-32767",
+        :source_ips => [
+          "0.0.0.0/0",
+          "::/0"
+        ],
+        :destination_ips => [] of String
+      },
+      {
+        :description => "Allow port 6443 (Kubernetes API server)",
+        :direction => "in",
+        :protocol => "tcp",
+        :port => "6443",
+        :source_ips => allowed_networks.api,
+        :destination_ips => [] of String
+      }
+    ]
+
+    if private_network.try(&.enabled)
+      rules += [
+        {
+          :description => "Allow all TCP traffic between nodes on the private network",
+          :direction => "in",
+          :protocol => "tcp",
+          :port => "any",
+          :source_ips => [private_network.subnet],
+          :destination_ips => [] of String
+        },
+        {
+          :description => "Allow all UDP traffic between nodes on the private network",
+          :direction => "in",
+          :protocol => "udp",
+          :port => "any",
+          :source_ips => [private_network.subnet],
+          :destination_ips => [] of String
+        }
+      ]
+    else
+      if settings.networking.cni.cilium?
+        rules += [
+          {
+            :description => "Allow wireguard traffic (Cilium)",
+            :direction => "in",
+            :protocol => "udp",
+            :port => "51871",
+            :source_ips => [
+              "0.0.0.0/0",
+              "::/0"
+            ],
+            :destination_ips => [] of String
+          }
+        ]
+      else
+        rules += [
+          {
+            :description => "Allow wireguard traffic",
+            :direction => "in",
+            :protocol => "udp",
+            :port => "51820",
+            :source_ips => [
+              "0.0.0.0/0",
+              "::/0"
+            ],
+            :destination_ips => [] of String
+          },
+          {
+            :description => "Allow wireguard traffic",
+            :direction => "in",
+            :protocol => "udp",
+            :port => "51821",
+            :source_ips => [
+              "0.0.0.0/0",
+              "::/0"
+            ],
+            :destination_ips => [] of String
+          }
+        ]
+      end
+
+      if masters.size > 0 && settings.datastore.mode == "etcd"
+        master_ips = masters.map do |master|
+          "#{master.public_ip_address}/32"
+        end
+
+        rules << {
+          :description => "Allow etcd traffic between masters",
+          :direction => "in",
+          :protocol => "tcp",
+          :port => "2379",
+          :source_ips => master_ips,
+          :destination_ips => [] of String
+        }
+
+        rules << {
+          :description => "Allow etcd traffic between masters",
+          :direction => "in",
+          :protocol => "tcp",
+          :port => "2380",
+          :source_ips => master_ips,
+          :destination_ips => [] of String
+        }
+      end
+    end
+
+    if settings.embedded_registry_mirror.enabled
+      rules << {
+        :description => "Allow traffic between nodes for peer-to-peer image distribution",
+        :direction => "in",
+        :protocol => "tcp",
+        :port => "5001",
+        :source_ips => [
+          "0.0.0.0/0",
+          "::/0"
+        ],
+        :destination_ips => [] of String
+      }
+    end
+
+    {
+      :name => firewall_name,
+      :rules => rules,
+      :apply_to => [
+        {
+          :label_selector => {
+            :selector => "cluster=#{settings.cluster_name}"
+          },
+          :type => "label_selector"
+        }
+      ]
+    }
+  end
+
+  private def default_log_prefix
+    "Firewall"
+  end
+end
diff --git a/src/hetzner/firewall/delete.cr b/src/hetzner/firewall/delete.cr
new file mode 100644
index 00000000..a347848d
--- /dev/null
+++ b/src/hetzner/firewall/delete.cr
@@ -0,0 +1,45 @@
+require "../client"
+require "./find"
+require "../../util"
+
+class Hetzner::Firewall::Delete
+  include Util
+
+  getter hetzner_client : Hetzner::Client
+  getter firewall_name : String
+  getter firewall_finder 
: Hetzner::Firewall::Find + + def initialize(@hetzner_client, @firewall_name) + @firewall_finder = Hetzner::Firewall::Find.new(@hetzner_client, @firewall_name) + end + + def run + firewall = firewall_finder.run + + if firewall + log_line "Deleting firewall..." + delete_firewall(firewall.id) + log_line "...firewall deleted." + else + log_line "Firewall does not exist, skipping delete" + end + + firewall_name + end + + private def delete_firewall(firewall_id) + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.delete("/firewalls", firewall_id) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to delete firewall: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to delete firewall in 5 seconds..." + raise "Failed to delete firewall" + end + end + end + + private def default_log_prefix + "Firewall" + end +end diff --git a/src/hetzner/firewall/find.cr b/src/hetzner/firewall/find.cr new file mode 100644 index 00000000..06ddc44b --- /dev/null +++ b/src/hetzner/firewall/find.cr @@ -0,0 +1,35 @@ +require "../client" +require "../firewall" +require "../firewalls_list" + +class Hetzner::Firewall::Find + getter hetzner_client : Hetzner::Client + getter firewall_name : String + + def initialize(@hetzner_client, @firewall_name) + end + + def run + firewalls = fetch_firewalls + + firewalls.find { |firewall| firewall.name == firewall_name } + end + + private def fetch_firewalls + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.get("/firewalls") + + if success + FirewallsList.from_json(response).firewalls + else + STDERR.puts "[#{default_log_prefix}] Failed to fetch firewall: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to fetch firewall in 5 seconds..." + raise "Failed to fetch firewall" + end + end + end + + private def default_log_prefix + "Firewall" + end +end diff --git a/src/hetzner/firewalls_list.cr b/src/hetzner/firewalls_list.cr new file mode 100644 index 00000000..c92cbbc3 --- /dev/null +++ b/src/hetzner/firewalls_list.cr @@ -0,0 +1,7 @@ +require "./firewall" + +class Hetzner::FirewallsList + include JSON::Serializable + + property firewalls : Array(Hetzner::Firewall) +end diff --git a/src/hetzner/instance.cr b/src/hetzner/instance.cr new file mode 100644 index 00000000..4bb405ff --- /dev/null +++ b/src/hetzner/instance.cr @@ -0,0 +1,46 @@ +require "json" +require "./public_net" +require "./network_interface" + +class Hetzner::Instance + include JSON::Serializable + + property id : Int32 + property name : String + property status : String + getter public_net : PublicNet? + getter private_net : Array(Hetzner::NetworkInterface)? + + def public_ip_address + public_net.try(&.ipv4).try(&.ip) + end + + def private_ip_address + net = private_net + + return public_ip_address unless net + return if net.try(&.empty?) + + net[0].ip + end + + def host_ip_address + if public_ip_address.nil? + private_ip_address + else + public_ip_address + end + end + + def master? + /-master\d+/ =~ name + end + + def initialize(id : Int32, status : String, instance_name : String, internal_ip : String, external_ip : String) + @id = id + @status = status + @name = instance_name + @public_net = PublicNet.new(external_ip) unless external_ip.blank? + @private_net = [NetworkInterface.new(internal_ip)] unless internal_ip.blank? 
+ end +end diff --git a/src/hetzner/instance/create.cr b/src/hetzner/instance/create.cr new file mode 100644 index 00000000..a819bc91 --- /dev/null +++ b/src/hetzner/instance/create.cr @@ -0,0 +1,300 @@ +require "crinja" +require "../client" +require "../ssh_key" +require "../network" +require "../placement_group" +require "./find" +require "../../util" +require "../../util/ssh" +require "../../util/shell" +require "../../kubernetes/util" + + +class Hetzner::Instance::Create + include Util + include Util::Shell + include Kubernetes::Util + + CLOUD_INIT_YAML = {{ read_file("#{__DIR__}/../../../templates/cloud_init.yaml") }} + + private getter settings : Configuration::Main + getter instance_name : String + private getter hetzner_client : Hetzner::Client + private getter cluster_name : String + private getter instance_type : String + private getter image : String | Int64 + private getter location : String + private getter ssh_key : Hetzner::SSHKey + private getter network : Hetzner::Network? + private getter enable_public_net_ipv4 : Bool + private getter enable_public_net_ipv6 : Bool + private getter additional_packages : Array(String) + private getter additional_post_create_commands : Array(String) + private getter instance_finder : Hetzner::Instance::Find + private getter snapshot_os : String + private getter ssh : Configuration::NetworkingComponents::SSH + private getter settings : Configuration::Main + private getter private_ssh_key_path : String + private getter public_ssh_key_path : String + private getter mutex : Mutex + private getter ssh_client : Util::SSH do + Util::SSH.new(ssh.private_key_path, ssh.public_key_path) + end + private getter instance_existed : Bool = false + + def initialize( + @settings, + @hetzner_client, + @mutex, + @instance_name, + @instance_type, + @image, + @ssh_key, + @network, + @placement_group : Hetzner::PlacementGroup? = nil, + @additional_packages = [] of String, + @additional_post_create_commands = [] of String, + @location = "" + ) + + @cluster_name = settings.cluster_name + @snapshot_os = settings.snapshot_os + @location = settings.masters_pool.location if location.empty? + @ssh = settings.networking.ssh + @enable_public_net_ipv4 = settings.networking.public_network.ipv4 + @enable_public_net_ipv6 = settings.networking.public_network.ipv6 + @private_ssh_key_path = settings.networking.ssh.private_key_path + @public_ssh_key_path = settings.networking.ssh.public_key_path + + @instance_finder = Hetzner::Instance::Find.new(@settings, @hetzner_client, @instance_name) + end + + def run + instance = find_instance_with_kubectl || instance_finder.run + + if instance + @instance_existed = true + log_line "Instance #{instance_name} already exists, skipping create" + ensure_instance_is_ready + else + instance = create_instance + + log_line "...instance #{instance_name} created" + end + + instance.not_nil! + end + + private def create_instance + attempts = 0 + + loop do + attempts += 1 + log_line "Creating instance #{instance_name} (attempt #{attempts})..." 
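+      # the request is retried until the Hetzner API accepts it; a failed
+      # response body is printed below before the next attempt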
+ success, response = hetzner_client.post("/servers", instance_config) + puts response unless success + break if success + end + + ensure_instance_is_ready + end + + private def ensure_instance_is_ready + ready = false + powering_on_count = 0 + attaching_to_network_count = 0 + + until ready + if !instance_existed && settings.networking.private_network.enabled + sleep 10 + end + + instance = instance_finder.run + + next unless instance + + log_line "Instance status: #{instance.status}" + + if instance.status != "running" && settings.networking.private_network.enabled + powering_on_count += 1 + power_on_instance(instance, powering_on_count) + next + end + + sleep 5 + + if settings.networking.private_network.enabled && !instance.try(&.private_ip_address) + attaching_to_network_count += 1 + attach_instance_to_network(instance, attaching_to_network_count) + next + end + + ssh_client.wait_for_instance instance, ssh.port, ssh.use_agent, "echo ready", "ready" + ready = true + end + + instance + end + + private def power_on_instance(instance, powering_on_count) + log_line "Powering on instance (attempt #{powering_on_count})" + hetzner_client.post("/servers/#{instance.id}/actions/poweron", {} of String => String) + log_line "Waiting for instance to be powered on..." + end + + private def attach_instance_to_network(instance, attaching_to_network_count) + mutex.synchronize do + log_line "Attaching instance to network (attempt #{attaching_to_network_count})" + hetzner_client.post("/servers/#{instance.id}/actions/attach_to_network", { :network => network.not_nil!.id }) + log_line "Waiting for instance to be attached to the network..." + end + end + + private def instance_config + user_data = Hetzner::Instance::Create.cloud_init(settings, ssh.port, snapshot_os, additional_packages, additional_post_create_commands) + + base_config = { + :name => instance_name, + :location => location, + :image => image, + :public_net => { + :enable_ipv4 => enable_public_net_ipv4, + :enable_ipv6 => enable_public_net_ipv6, + }, + :server_type => instance_type, + :ssh_keys => [ + ssh_key.id + ], + :user_data => user_data, + :labels => { + :cluster => cluster_name, + :role => (instance_name =~ /master/ ? "master" : "worker") + }, + :start_after_create => true + } + + placement_group = @placement_group + network = @network + + base_config = base_config.merge({ :placement_group => placement_group.id }) unless placement_group.nil? + base_config = base_config.merge({ :networks => [network.id] }) unless network.nil? + + base_config + end + + def self.cloud_init(settings, ssh_port = 22, snapshot_os = "default", additional_packages = [] of String, additional_post_create_commands = [] of String, init_commands = [] of String) + Crinja.render(CLOUD_INIT_YAML, { + packages_str: generate_packages_str(snapshot_os, additional_packages), + post_create_commands_str: generate_post_create_commands_str(snapshot_os, additional_post_create_commands, init_commands), + eth1_str: eth1(snapshot_os), + growpart_str: growpart(snapshot_os), + ssh_port: ssh_port + }) + end + + def self.growpart(snapshot_os) + snapshot_os == "microos" ? <<-YAML + growpart: + devices: ["/var"] + YAML + : "" + end + + def self.eth1(snapshot_os) + snapshot_os == "microos" ? 
<<-YAML + - content: | + BOOTPROTO='dhcp' + STARTMODE='auto' + path: /etc/sysconfig/network/ifcfg-eth1 + YAML + : "" + end + + def self.mandatory_post_create_commands + [ + "hostnamectl set-hostname $(curl http://169.254.169.254/hetzner/v1/metadata/hostname)", + "update-crypto-policies --set DEFAULT:SHA1 || true", + "/etc/configure-ssh.sh", + "echo \"nameserver 8.8.8.8\" > /etc/k8s-resolv.conf" + ] + end + + def self.generate_post_create_commands_str(snapshot_os, additional_post_create_commands, init_commands) + post_create_commands = mandatory_post_create_commands + + if snapshot_os == "microos" + post_create_commands += microos_commands + end + + additional_post_create_commands = additional_post_create_commands.map do |command| + if command.includes?("\n") + lines = ["|"] + command.split("\n").each do |line| + lines << " " + line + end + lines.join("\n") + else + command + end + end + + post_create_commands = post_create_commands + init_commands + additional_post_create_commands + + "- #{post_create_commands.join("\n- ")}" + end + + def self.generate_packages_str(snapshot_os, additional_packages) + packages = %w[fail2ban] + wireguard = snapshot_os == "microos" ? "wireguard-tools" : "wireguard" + packages << wireguard + packages += additional_packages + "'#{packages.join("', '")}'" + end + + def self.microos_commands + [ + "btrfs filesystem resize max /var", + "sed -i 's/NETCONFIG_DNS_STATIC_SERVERS=\"\"/NETCONFIG_DNS_STATIC_SERVERS=\"1.1.1.1 1.0.0.1\"/g' /etc/sysconfig/network/config", + "sed -i 's/#SystemMaxUse=/SystemMaxUse=3G/g' /etc/systemd/journald.conf", + "sed -i 's/#MaxRetentionSec=/MaxRetentionSec=1week/g' /etc/systemd/journald.conf", + "sed -i 's/NUMBER_LIMIT=\"2-10\"/NUMBER_LIMIT=\"4\"/g' /etc/snapper/configs/root", + "sed -i 's/NUMBER_LIMIT_IMPORTANT=\"4-10\"/NUMBER_LIMIT_IMPORTANT=\"3\"/g' /etc/snapper/configs/root", + "sed -i 's/NETCONFIG_NIS_SETDOMAINNAME=\"yes\"/NETCONFIG_NIS_SETDOMAINNAME=\"no\"/g' /etc/sysconfig/network/config", + "sed -i 's/DHCLIENT_SET_HOSTNAME=\"yes\"/DHCLIENT_SET_HOSTNAME=\"no\"/g' /etc/sysconfig/network/dhcp" + ] + end + + private def default_log_prefix + "Instance #{instance_name}" + end + + private def find_instance_with_kubectl + return nil unless api_server_ready?(settings.kubeconfig_path) + + command = %(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}{"\\n"}{.items[0].status.addresses[?(@.type=="ExternalIP")].address}' --field-selector metadata.name=#{instance_name}) + + result = run_shell_command(command, settings.kubeconfig_path, settings.hetzner_token, print_output: false, abort_on_error: false) + + if result.success? + internal_ip, external_ip = result.output.split("\n") + external_ip = internal_ip if external_ip.blank? # before CCM is installed external IP is not available + + unless internal_ip.blank? && external_ip.blank? 
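+        # the real server ID is not known at this point, so a random placeholder
+        # ID is used; the node only counts as found once it answers over SSH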
+ instance = Hetzner::Instance.new( + id: Random::Secure.rand(Int32::MIN..Int32::MAX), + status: "running", + instance_name: instance_name, + internal_ip: internal_ip, + external_ip: external_ip + ) + + result = ssh_client.wait_for_instance instance, ssh.port, ssh.use_agent, "echo ready", "ready" + + if result == "ready" + log_line "Instance was already a member of the cluster" + instance + end + end + end + end +end diff --git a/src/hetzner/instance/delete.cr b/src/hetzner/instance/delete.cr new file mode 100644 index 00000000..703970bc --- /dev/null +++ b/src/hetzner/instance/delete.cr @@ -0,0 +1,50 @@ +require "../client" +require "../instance" +require "../instances_list" +require "./find" +require "../../util" + +class Hetzner::Instance::Delete + include Util + + getter hetzner_client : Hetzner::Client + getter instance_name : String + getter instance_finder : Hetzner::Instance::Find + + private getter settings : Configuration::Main + private getter ssh : Configuration::NetworkingComponents::SSH + private getter ssh_client : Util::SSH do + Util::SSH.new(ssh.private_key_path, ssh.public_key_path) + end + + def initialize(@settings, @hetzner_client, @instance_name) + @ssh = settings.networking.ssh + @instance_finder = Hetzner::Instance::Find.new(@settings, @hetzner_client, @instance_name) + end + + def run + if instance = instance_finder.run + log_line "Deleting instance #{instance_name}..." + + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.delete("/servers", instance.id) + + if success + log_line "...instance #{instance_name} deleted" + else + STDERR.puts "[#{default_log_prefix}] Failed to delete instance #{instance_name}: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to delete instance #{instance_name} in 5 seconds..." + raise "Failed to delete instance" + end + end + else + log_line "Instance #{instance_name} does not exist, skipping delete" + end + + instance_name + end + + private def default_log_prefix + "Instance #{instance_name}" + end +end diff --git a/src/hetzner/instance/find.cr b/src/hetzner/instance/find.cr new file mode 100644 index 00000000..23988d85 --- /dev/null +++ b/src/hetzner/instance/find.cr @@ -0,0 +1,46 @@ +require "../client" +require "../instance" +require "../instances_list" +require "../../util" +require "../../util/ssh" +require "../../util/shell" + +class Hetzner::Instance::Find + include Util + include Util::Shell + + getter hetzner_client : Hetzner::Client + getter instance_name : String + + private getter settings : Configuration::Main + private getter ssh : Configuration::NetworkingComponents::SSH + private getter ssh_client : Util::SSH do + Util::SSH.new(ssh.private_key_path, ssh.public_key_path) + end + + def initialize(@settings, @hetzner_client, @instance_name) + @ssh = settings.networking.ssh + end + + def run + instances = Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.get("/servers", { :name => instance_name }) + + if success + InstancesList.from_json(response).servers + else + STDERR.puts "[#{default_log_prefix}] Failed to fetch instance #{instance_name}: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to fetch instance #{instance_name} in 5 seconds..." 
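+        # raising hands control back to Retriable, which re-runs this block
+        # (up to 10 attempts, 5 seconds apart)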
+ raise "Failed to fetch instance" + end + end + + instances.find do |instance| + instance.name == instance_name + end + end + + def default_log_prefix + "Instances" + end +end diff --git a/src/hetzner/instance_type.cr b/src/hetzner/instance_type.cr new file mode 100644 index 00000000..278abbc8 --- /dev/null +++ b/src/hetzner/instance_type.cr @@ -0,0 +1,8 @@ +require "./client" + +class Hetzner::InstanceType + include JSON::Serializable + + property id : Int32 + property name : String +end diff --git a/src/hetzner/instance_types_list.cr b/src/hetzner/instance_types_list.cr new file mode 100644 index 00000000..fdd913b0 --- /dev/null +++ b/src/hetzner/instance_types_list.cr @@ -0,0 +1,7 @@ +require "./instance_type" + +class Hetzner::InstanceTypesList + include JSON::Serializable + + property server_types : Array(Hetzner::InstanceType) +end diff --git a/src/hetzner/instances_list.cr b/src/hetzner/instances_list.cr new file mode 100644 index 00000000..8a5f6925 --- /dev/null +++ b/src/hetzner/instances_list.cr @@ -0,0 +1,7 @@ +require "./instance" + +class Hetzner::InstancesList + include JSON::Serializable + + property servers : Array(Hetzner::Instance) +end diff --git a/src/hetzner/ipv4.cr b/src/hetzner/ipv4.cr new file mode 100644 index 00000000..2a4be3e2 --- /dev/null +++ b/src/hetzner/ipv4.cr @@ -0,0 +1,11 @@ +require "./client" + +class Hetzner::Ipv4 + include JSON::Serializable + + property ip : String? + + def initialize(ip : String) + @ip = ip + end +end diff --git a/src/hetzner/load_balancer.cr b/src/hetzner/load_balancer.cr new file mode 100644 index 00000000..58d0fe43 --- /dev/null +++ b/src/hetzner/load_balancer.cr @@ -0,0 +1,23 @@ +require "json" +require "./public_net" + +class Hetzner::LoadBalancer + include JSON::Serializable + + property id : Int32 + property name : String + property private_net : Array(Hetzner::Ipv4) + getter public_net : PublicNet? + + def public_ip_address + public_net.try(&.ipv4).try(&.ip) + end + + def private_ip_address + if private_net.any? + private_net[0].try(&.ip) + else + public_ip_address + end + end +end diff --git a/src/hetzner/load_balancer/create.cr b/src/hetzner/load_balancer/create.cr new file mode 100644 index 00000000..3d99c1df --- /dev/null +++ b/src/hetzner/load_balancer/create.cr @@ -0,0 +1,120 @@ +require "../client" +require "./find" +require "../../util" + +class Hetzner::LoadBalancer::Create + include Util + + getter settings : Configuration::Main + getter hetzner_client : Hetzner::Client + getter cluster_name : String + getter location : String + getter network_id : Int64? = 0 + getter load_balancer_finder : Hetzner::LoadBalancer::Find + getter load_balancer_name : String do + "#{cluster_name}-api" + end + + def initialize(@settings, @hetzner_client, @location, @network_id) + @cluster_name = settings.cluster_name + @load_balancer_finder = Hetzner::LoadBalancer::Find.new(@hetzner_client, load_balancer_name) + end + + def run + load_balancer = load_balancer_finder.run + + if load_balancer + log_line "Load balancer for API server already exists, skipping create" + else + log_line "Creating load balancer for API server..." + create_load_balancer + load_balancer = wait_for_load_balancer_public_ip + log_line "...load balancer for API server created" + end + + load_balancer.not_nil! 
+ end + + private def create_load_balancer + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.post("/load_balancers", load_balancer_config) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to create load balancer: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to create load balancer in 5 seconds..." + raise "Failed to create load balancer" + end + end + end + + private def wait_for_load_balancer_public_ip + loop do + load_balancer = load_balancer_finder.run + break load_balancer if load_balancer.try(&.public_ip_address) + sleep 1 + end + end + + private def load_balancer_config + if settings.networking.private_network.enabled + { + :algorithm => { + :type => "round_robin" + }, + :load_balancer_type => "lb11", + :location => location, + :name => load_balancer_name, + :network => network_id, + :public_interface => true, + :services => [ + { + :destination_port => 6443, + :listen_port => 6443, + :protocol => "tcp", + :proxyprotocol => false + } + ], + :targets => [ + { + :label_selector => { + :selector => "cluster=#{cluster_name},role=master" + }, + :type => "label_selector", + :use_private_ip => true + } + ] + } + else + { + :algorithm => { + :type => "round_robin" + }, + :load_balancer_type => "lb11", + :location => location, + :name => load_balancer_name, + :public_interface => true, + :services => [ + { + :destination_port => 6443, + :listen_port => 6443, + :protocol => "tcp", + :proxyprotocol => false + } + ], + :targets => [ + { + :label_selector => { + :selector => "cluster=#{cluster_name},role=master" + }, + :type => "label_selector", + :use_private_ip => false + } + ] + } + end + end + + private def default_log_prefix + "API Load balancer" + end +end diff --git a/src/hetzner/load_balancer/delete.cr b/src/hetzner/load_balancer/delete.cr new file mode 100644 index 00000000..4fa9b86d --- /dev/null +++ b/src/hetzner/load_balancer/delete.cr @@ -0,0 +1,65 @@ +require "../client" +require "./find" +require "../../util" + +class Hetzner::LoadBalancer::Delete + include Util + + getter hetzner_client : Hetzner::Client + getter cluster_name : String + getter load_balancer_name : String do + "#{cluster_name}-api" + end + getter load_balancer_finder : Hetzner::LoadBalancer::Find + + def initialize(@hetzner_client, @cluster_name) + @load_balancer_finder = Hetzner::LoadBalancer::Find.new(@hetzner_client, load_balancer_name) + end + + def run + load_balancer = load_balancer_finder.run + + if load_balancer + log_line "Deleting load balancer for API server..." + delete_load_balancer(load_balancer.id) + log_line "...load balancer for API server deleted" + else + log_line "Load balancer for API server does not exist, skipping delete" + end + + load_balancer_name + end + + private def delete_load_balancer(load_balancer_id) + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.post("/load_balancers/#{load_balancer_id}/actions/remove_target", remove_targets_config) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to delete load balancer: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to delete load balancer in 5 seconds..." 
+        raise "Failed to delete load balancer"
+      end
+
+      success, response = hetzner_client.delete("/load_balancers", load_balancer_id)
+
+      unless success
+        STDERR.puts "[#{default_log_prefix}] Failed to delete load balancer: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to delete load balancer in 5 seconds..."
+        raise "Failed to delete load balancer"
+      end
+    end
+  end
+
+  private def remove_targets_config
+    {
+      :label_selector => {
+        :selector => "cluster=#{cluster_name},role=master"
+      },
+      :type => "label_selector"
+    }
+  end
+
+  private def default_log_prefix
+    "API Load balancer"
+  end
+end
diff --git a/src/hetzner/load_balancer/find.cr b/src/hetzner/load_balancer/find.cr
new file mode 100644
index 00000000..ab1db680
--- /dev/null
+++ b/src/hetzner/load_balancer/find.cr
@@ -0,0 +1,36 @@
+require "../client"
+require "../load_balancer"
+require "../load_balancers_list"
+
+class Hetzner::LoadBalancer::Find
+  getter hetzner_client : Hetzner::Client
+  getter load_balancer_name : String
+
+  def initialize(@hetzner_client, @load_balancer_name)
+  end
+
+  def run
+    load_balancers = fetch_load_balancers
+
+    load_balancers.find { |load_balancer| load_balancer.name == load_balancer_name }
+  end
+
+  private def fetch_load_balancers
+
+    Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+      success, response = hetzner_client.get("/load_balancers")
+
+      if success
+        LoadBalancersList.from_json(response).load_balancers
+      else
+        STDERR.puts "[#{default_log_prefix}] Failed to fetch load balancers: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to fetch load balancers in 5 seconds..."
+        raise "Failed to fetch load balancers"
+      end
+    end
+  end
+
+  private def default_log_prefix
+    "API Load balancer"
+  end
+end
diff --git a/src/hetzner/load_balancers_list.cr b/src/hetzner/load_balancers_list.cr
new file mode 100644
index 00000000..f2d49aa0
--- /dev/null
+++ b/src/hetzner/load_balancers_list.cr
@@ -0,0 +1,7 @@
+require "./load_balancer"
+
+class Hetzner::LoadBalancersList
+  include JSON::Serializable
+
+  property load_balancers : Array(Hetzner::LoadBalancer)
+end
diff --git a/src/hetzner/location.cr b/src/hetzner/location.cr
new file mode 100644
index 00000000..e99f515f
--- /dev/null
+++ b/src/hetzner/location.cr
@@ -0,0 +1,9 @@
+require "./client"
+
+class Hetzner::Location
+  include JSON::Serializable
+
+  property id : Int32
+  property name : String
+  property network_zone : String
+end
diff --git a/src/hetzner/locations_list.cr b/src/hetzner/locations_list.cr
new file mode 100644
index 00000000..9869141f
--- /dev/null
+++ b/src/hetzner/locations_list.cr
@@ -0,0 +1,7 @@
+require "./location"
+
+class Hetzner::LocationsList
+  include JSON::Serializable
+
+  property locations : Array(Hetzner::Location)
+end
diff --git a/src/hetzner/network.cr b/src/hetzner/network.cr
new file mode 100644
index 00000000..fa6c87af
--- /dev/null
+++ b/src/hetzner/network.cr
@@ -0,0 +1,8 @@
+require "json"
+
+class Hetzner::Network
+  include JSON::Serializable
+
+  property id : Int64
+  property name : String
+end
diff --git a/src/hetzner/network/create.cr b/src/hetzner/network/create.cr
new file mode 100644
index 00000000..ce19a497
--- /dev/null
+++ b/src/hetzner/network/create.cr
@@ -0,0 +1,67 @@
+require "../client"
+require "./find"
+require "../../util"
+require "../../configuration/main"
+require "../../configuration/networking"
+
+class Hetzner::Network::Create
+  include Util
+
+  getter hetzner_client : Hetzner::Client
+  getter settings : Configuration::Main
+  getter network_name 
: String + getter location : String + getter network_finder : Hetzner::Network::Find + getter locations : Array(Hetzner::Location) + + def initialize(@settings, @hetzner_client, @network_name, @locations) + @location = settings.masters_pool.location + @network_finder = Hetzner::Network::Find.new(hetzner_client, network_name) + end + + def run + network = network_finder.run + + if network + log_line "Private network already exists, skipping create" + else + log_line "Creating private network..." + + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.post("/networks", network_config) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to create private network: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to create private network in 5 seconds..." + raise "Failed to create private network" + end + end + + network = network_finder.run + + log_line "...private network created" + end + + network.not_nil! + end + + private def network_config + network_zone = locations.find { |l| l.name == location }.not_nil!.network_zone + + { + :name => network_name, + :ip_range => settings.networking.private_network.subnet, + :subnets => [ + { + :ip_range => settings.networking.private_network.subnet, + :network_zone => network_zone, + :type => "cloud" + } + ] + } + end + + private def default_log_prefix + "Private Network" + end +end diff --git a/src/hetzner/network/delete.cr b/src/hetzner/network/delete.cr new file mode 100644 index 00000000..7ca72ee6 --- /dev/null +++ b/src/hetzner/network/delete.cr @@ -0,0 +1,41 @@ +require "../client" +require "./find" +require "../../util" + +class Hetzner::Network::Delete + include Util + + getter hetzner_client : Hetzner::Client + getter network_name : String + getter network_finder : Hetzner::Network::Find + + def initialize(@hetzner_client, @network_name) + @network_finder = Hetzner::Network::Find.new(@hetzner_client, @network_name) + end + + def run + if network = network_finder.run + log_line "Deleting private network..." + + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.delete("/networks", network.id) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to delete private network: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to delete private network in 5 seconds..." 
+ raise "Failed to delete private network" + end + end + + log_line "...private network deleted" + else + log_line "Private network does not exist, skipping delete" + end + + network_name + end + + private def default_log_prefix + "Private network" + end +end diff --git a/src/hetzner/network/find.cr b/src/hetzner/network/find.cr new file mode 100644 index 00000000..f324a684 --- /dev/null +++ b/src/hetzner/network/find.cr @@ -0,0 +1,35 @@ +require "../client" +require "../network" +require "../networks_list" + +class Hetzner::Network::Find + getter hetzner_client : Hetzner::Client + getter network_name : String + + def initialize(@hetzner_client, @network_name) + end + + def run + networks = fetch_networks + + networks.find { |network| network.name == network_name } + end + + private def fetch_networks + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.get("/networks") + + if success + NetworksList.from_json(response).networks + else + STDERR.puts "[#{default_log_prefix}] Failed to fetch networks: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to fetch networks in 5 seconds..." + raise "Failed to fetch networks" + end + end + end + + private def default_log_prefix + "Private network" + end +end diff --git a/src/hetzner/network_interface.cr b/src/hetzner/network_interface.cr new file mode 100644 index 00000000..14267d7d --- /dev/null +++ b/src/hetzner/network_interface.cr @@ -0,0 +1,11 @@ +require "json" + +class Hetzner::NetworkInterface + include JSON::Serializable + + property ip : String? + + def initialize(ip : String) + @ip = ip + end +end diff --git a/src/hetzner/networks_list.cr b/src/hetzner/networks_list.cr new file mode 100644 index 00000000..2d06cbd0 --- /dev/null +++ b/src/hetzner/networks_list.cr @@ -0,0 +1,7 @@ +require "./network" + +class Hetzner::NetworksList + include JSON::Serializable + + property networks : Array(Hetzner::Network) +end diff --git a/src/hetzner/placement_group.cr b/src/hetzner/placement_group.cr new file mode 100644 index 00000000..b09284d2 --- /dev/null +++ b/src/hetzner/placement_group.cr @@ -0,0 +1,10 @@ +require "json" + +class Hetzner::PlacementGroup + include JSON::Serializable + include JSON::Serializable::Unmapped + + property id : Int32 + property name : String + property servers : Array(Int64) +end diff --git a/src/hetzner/placement_group/all.cr b/src/hetzner/placement_group/all.cr new file mode 100644 index 00000000..5b4933dc --- /dev/null +++ b/src/hetzner/placement_group/all.cr @@ -0,0 +1,54 @@ +require "../client" +require "../placement_group" +require "../placement_groups_list" + +class Hetzner::PlacementGroup::All + getter hetzner_client : Hetzner::Client + + def initialize(@hetzner_client) + end + + def run : Array(Hetzner::PlacementGroup) + fetch_placement_groups + end + + def delete_unused + all_placement_groups = fetch_placement_groups + + all_placement_groups.reject! 
do |placement_group| + if placement_group.servers.size == 0 + Hetzner::PlacementGroup::Delete.new(hetzner_client, placement_group: placement_group ).run + true + else + false + end + end + + all_placement_groups + end + + def delete_all + fetch_placement_groups.each do |placement_group| + Hetzner::PlacementGroup::Delete.new(hetzner_client, placement_group: placement_group ).run + end + end + + private def fetch_placement_groups + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.get("/placement_groups", { :per_page => 100 }) + + if success + groups = PlacementGroupsList.from_json(response).placement_groups + groups.sort_by { |placement_group| placement_group.name } + else + STDERR.puts "[#{default_log_prefix}] Failed to fetch placement groups: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to fetch placement groups in 5 seconds..." + raise "Failed to fetch placement groups" + end + end + end + + private def default_log_prefix + "Placement groups" + end +end diff --git a/src/hetzner/placement_group/create.cr b/src/hetzner/placement_group/create.cr new file mode 100644 index 00000000..2a9ed539 --- /dev/null +++ b/src/hetzner/placement_group/create.cr @@ -0,0 +1,53 @@ +require "../client" +require "./find" +require "../../util" + +class Hetzner::PlacementGroup::Create + include Util + + getter hetzner_client : Hetzner::Client + getter placement_group_name : String + getter placement_group_finder : Hetzner::PlacementGroup::Find + + def initialize(@hetzner_client, @placement_group_name) + @placement_group_finder = Hetzner::PlacementGroup::Find.new(@hetzner_client, @placement_group_name) + end + + def run + placement_group = placement_group_finder.run + + if placement_group + log_line "Placement group #{placement_group_name} already exists, skipping create" + else + log_line "Creating placement group #{placement_group_name}..." + create_placement_group + placement_group = placement_group_finder.run + log_line "...placement group #{placement_group_name} created" + end + + placement_group.not_nil! + end + + private def create_placement_group + Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do + success, response = hetzner_client.post("/placement_groups", placement_group_config) + + unless success + STDERR.puts "[#{default_log_prefix}] Failed to create placement group #{placement_group_name}: #{response}" + STDERR.puts "[#{default_log_prefix}] Retrying to create placement group #{placement_group_name} in 5 seconds..." + raise "Failed to create placement group" + end + end + end + + private def placement_group_config + { + :name => placement_group_name, + :type => "spread" + } + end + + private def default_log_prefix + "Placement groups" + end +end diff --git a/src/hetzner/placement_group/delete.cr b/src/hetzner/placement_group/delete.cr new file mode 100644 index 00000000..d5cac840 --- /dev/null +++ b/src/hetzner/placement_group/delete.cr @@ -0,0 +1,54 @@ +require "../client" +require "./find" +require "../../util" + +class Hetzner::PlacementGroup::Delete + include Util + + private getter hetzner_client : Hetzner::Client + private getter deleting_unused = false + + def initialize(@hetzner_client, @placement_group_name : String? = nil, @placement_group : Hetzner::PlacementGroup? 
= nil)
+    if placement_group_name = @placement_group_name
+      placement_group_finder = Hetzner::PlacementGroup::Find.new(@hetzner_client, placement_group_name)
+      @placement_group = placement_group_finder.run
+    elsif placement_group = @placement_group
+      @placement_group_name = placement_group.name
+    end
+
+    @deleting_unused = placement_group_name.nil?
+  end
+
+  def run
+    placement_group = @placement_group
+    placement_group_name = @placement_group_name
+
+    if placement_group
+      if deleting_unused
+        log_line "Deleting unused placement group #{placement_group_name}..."
+      else
+        log_line "Deleting placement group #{placement_group_name}..."
+      end
+
+      Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+        success, response = hetzner_client.delete("/placement_groups", placement_group.id)
+
+        if success
+          log_line "...placement group #{placement_group_name} deleted"
+        else
+          STDERR.puts "[#{default_log_prefix}] Failed to delete placement group #{placement_group_name}: #{response}"
+          STDERR.puts "[#{default_log_prefix}] Retrying to delete placement group #{placement_group_name} in 5 seconds..."
+          raise "Failed to delete placement group"
+        end
+      end
+    else
+      log_line "Placement group #{placement_group_name} does not exist, skipping delete"
+    end
+
+    placement_group_name
+  end
+
+  private def default_log_prefix
+    "Placement groups"
+  end
+end
diff --git a/src/hetzner/placement_group/find.cr b/src/hetzner/placement_group/find.cr
new file mode 100644
index 00000000..b2875d2a
--- /dev/null
+++ b/src/hetzner/placement_group/find.cr
@@ -0,0 +1,35 @@
+require "../client"
+require "../placement_group"
+require "../placement_groups_list"
+
+class Hetzner::PlacementGroup::Find
+  getter hetzner_client : Hetzner::Client
+  getter placement_group_name : String
+
+  def initialize(@hetzner_client, @placement_group_name)
+  end
+
+  def run
+    placement_groups = fetch_placement_groups
+
+    placement_groups.find { |placement_group| placement_group.name == placement_group_name }
+  end
+
+  private def fetch_placement_groups
+    Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+      success, response = hetzner_client.get("/placement_groups", { :name => placement_group_name })
+
+      if success
+        PlacementGroupsList.from_json(response).placement_groups
+      else
+        STDERR.puts "[#{default_log_prefix}] Failed to fetch placement group #{placement_group_name}: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to fetch placement group #{placement_group_name} in 5 seconds..."
+        raise "Failed to fetch placement groups"
+      end
+    end
+  end
+
+  private def default_log_prefix
+    "Placement groups"
+  end
+end
diff --git a/src/hetzner/placement_groups_list.cr b/src/hetzner/placement_groups_list.cr
new file mode 100644
index 00000000..e64cca9e
--- /dev/null
+++ b/src/hetzner/placement_groups_list.cr
@@ -0,0 +1,7 @@
+require "./placement_group"
+
+class Hetzner::PlacementGroupsList
+  include JSON::Serializable
+
+  property placement_groups : Array(Hetzner::PlacementGroup)
+end
diff --git a/src/hetzner/public_net.cr b/src/hetzner/public_net.cr
new file mode 100644
index 00000000..522d68ff
--- /dev/null
+++ b/src/hetzner/public_net.cr
@@ -0,0 +1,12 @@
+require "./client"
+require "./ipv4"
+
+class Hetzner::PublicNet
+  include JSON::Serializable
+
+  property ipv4 : Hetzner::Ipv4?
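+  # presumably nil when the server is created without a public IPv4, which is
+  # why callers go through public_net.try(&.ipv4)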
+
+  def initialize(ipv4 : String)
+    @ipv4 = Hetzner::Ipv4.new(ipv4)
+  end
+end
diff --git a/src/hetzner/servers_list.cr b/src/hetzner/servers_list.cr
new file mode 100644
index 00000000..ad2c5faa
--- /dev/null
+++ b/src/hetzner/servers_list.cr
@@ -0,0 +1,7 @@
+require "./instance"
+
+class Hetzner::InstancesList
+  include JSON::Serializable
+
+  property instances : Array(Hetzner::Instance)
+end
diff --git a/src/hetzner/ssh_key.cr b/src/hetzner/ssh_key.cr
new file mode 100644
index 00000000..a683caa6
--- /dev/null
+++ b/src/hetzner/ssh_key.cr
@@ -0,0 +1,9 @@
+require "json"
+
+class Hetzner::SSHKey
+  include JSON::Serializable
+
+  property id : Int32
+  property name : String
+  property fingerprint : String
+end
diff --git a/src/hetzner/ssh_key/create.cr b/src/hetzner/ssh_key/create.cr
new file mode 100644
index 00000000..7001e20e
--- /dev/null
+++ b/src/hetzner/ssh_key/create.cr
@@ -0,0 +1,57 @@
+require "../client"
+require "./find"
+require "../../util"
+
+class Hetzner::SSHKey::Create
+  include Util
+
+  getter hetzner_client : Hetzner::Client
+  getter settings : Configuration::Main
+  getter ssh_key_name : String
+  getter public_ssh_key_path : String
+  getter ssh_key_finder : Hetzner::SSHKey::Find
+
+  def initialize(@hetzner_client, @settings)
+    @ssh_key_name = settings.cluster_name
+    @public_ssh_key_path = settings.networking.ssh.public_key_path
+    @ssh_key_finder = Hetzner::SSHKey::Find.new(hetzner_client, ssh_key_name, public_ssh_key_path)
+  end
+
+  def run
+    ssh_key = ssh_key_finder.run
+
+    if ssh_key
+      log_line "SSH key already exists, skipping create"
+    else
+      log_line "Creating SSH key..."
+
+      create_ssh_key
+      ssh_key = ssh_key_finder.run
+
+      log_line "...SSH key created"
+    end
+
+    ssh_key.not_nil!
+  end
+
+  private def create_ssh_key
+    ssh_key_config = {
+      :name => ssh_key_name,
+      :public_key => File.read(public_ssh_key_path).chomp
+    }
+
+    Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+      success, response = hetzner_client.post("/ssh_keys", ssh_key_config)
+
+      unless success
+        STDERR.puts "[#{default_log_prefix}] Failed to create SSH key: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to create SSH key in 5 seconds..."
+        raise "Failed to create SSH key"
+      end
+    end
+  end
+
+  private def default_log_prefix
+    "SSH key"
+  end
+end
diff --git a/src/hetzner/ssh_key/delete.cr b/src/hetzner/ssh_key/delete.cr
new file mode 100644
index 00000000..01471925
--- /dev/null
+++ b/src/hetzner/ssh_key/delete.cr
@@ -0,0 +1,55 @@
+require "../client"
+require "../ssh_key"
+require "../ssh_keys_list"
+require "./find"
+require "../../util"
+
+class Hetzner::SSHKey::Delete
+  include Util
+
+  getter hetzner_client : Hetzner::Client
+  getter ssh_key_name : String
+  getter ssh_key_finder : Hetzner::SSHKey::Find
+
+  def initialize(@hetzner_client, @ssh_key_name, public_ssh_key_path)
+    @ssh_key_finder = Hetzner::SSHKey::Find.new(hetzner_client, ssh_key_name, public_ssh_key_path)
+  end
+
+  def run
+    ssh_key = ssh_key_finder.run
+
+    return handle_no_ssh_key if ssh_key.nil?
+    return handle_existing_ssh_key(ssh_key) if ssh_key.name == ssh_key_name
+
+    log_line "An SSH key with the expected fingerprint existed before creating the cluster, so I won't delete it"
+
+    ssh_key_name
+  end
+
+  private def handle_no_ssh_key
+    log_line "SSH key does not exist, skipping delete"
+    ssh_key_name
+  end
+
+  private def handle_existing_ssh_key(ssh_key)
+    log_line "Deleting SSH key..."
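+    # The API call below is retried up to 10 times at fixed 5-second
+    # intervals (no backoff); raising inside the block is what triggers
+    # the next attempt.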
+
+    Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+      success, response = hetzner_client.delete("/ssh_keys", ssh_key.id)
+
+      if success
+        log_line "...SSH key deleted"
+      else
+        STDERR.puts "[#{default_log_prefix}] Failed to delete SSH key: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to delete SSH key in 5 seconds..."
+        raise "Failed to delete SSH key"
+      end
+    end
+
+    ssh_key_name
+  end
+
+  private def default_log_prefix
+    "SSH key"
+  end
+end
diff --git a/src/hetzner/ssh_key/find.cr b/src/hetzner/ssh_key/find.cr
new file mode 100644
index 00000000..da4de02b
--- /dev/null
+++ b/src/hetzner/ssh_key/find.cr
@@ -0,0 +1,44 @@
+require "../client"
+require "../ssh_key"
+require "../ssh_keys_list"
+
+class Hetzner::SSHKey::Find
+  getter hetzner_client : Hetzner::Client
+  getter ssh_key_name : String
+  getter public_ssh_key_path : String
+
+  def initialize(@hetzner_client, @ssh_key_name, @public_ssh_key_path)
+  end
+
+  def run
+    ssh_keys = fetch_ssh_keys
+    fingerprint = calculate_fingerprint(public_ssh_key_path)
+
+    key = ssh_keys.find { |ssh_key| ssh_key.fingerprint == fingerprint }
+    key ||= ssh_keys.find { |ssh_key| ssh_key.name == ssh_key_name }
+    key
+  end
+
+  private def fetch_ssh_keys
+    Retriable.retry(max_attempts: 10, backoff: false, base_interval: 5.seconds) do
+      success, response = hetzner_client.get("/ssh_keys")
+
+      if success
+        SSHKeysList.from_json(response).ssh_keys
+      else
+        STDERR.puts "[#{default_log_prefix}] Failed to fetch SSH keys: #{response}"
+        STDERR.puts "[#{default_log_prefix}] Retrying to fetch SSH keys in 5 seconds..."
+        raise "Failed to fetch SSH keys"
+      end
+    end
+  end
+
+  private def calculate_fingerprint(public_ssh_key_path)
+    encoded_key = File.read(public_ssh_key_path).split[1]
+    Digest::MD5.hexdigest(Base64.decode(encoded_key)).chars.each_slice(2).map(&.join).join(":")
+  end
+
+  private def default_log_prefix
+    "SSH key"
+  end
+end
diff --git a/src/hetzner/ssh_keys_list.cr b/src/hetzner/ssh_keys_list.cr
new file mode 100644
index 00000000..501cfb1c
--- /dev/null
+++ b/src/hetzner/ssh_keys_list.cr
@@ -0,0 +1,7 @@
+require "./ssh_key"
+
+class Hetzner::SSHKeysList
+  include JSON::Serializable
+
+  property ssh_keys : Array(Hetzner::SSHKey)
+end
diff --git a/src/k3s.cr b/src/k3s.cr
new file mode 100644
index 00000000..5f6377ea
--- /dev/null
+++ b/src/k3s.cr
@@ -0,0 +1,46 @@
+require "yaml"
+require "file"
+require "crest"
+
+module K3s
+  GITHUB_DELIM_LINKS = ","
+  GITHUB_LINK_REGEX = /<(?<link>[^>]+)>; rel="(?<rel>[^"]+)"/
+  RELEASES_FILENAME = "/tmp/k3s-releases.yaml"
+
+  def self.available_releases
+    return YAML.parse(File.read(RELEASES_FILENAME)).as_a if File.exists?(RELEASES_FILENAME)
+
+    releases = fetch_all_releases_from_github
+    File.open(RELEASES_FILENAME, "w") { |f| YAML.dump(releases, f) }
+    releases
+  end
+
+  private def self.fetch_all_releases_from_github : Array(String)
+    releases = [] of String
+    next_page_url = "https://api.github.com/repos/k3s-io/k3s/tags?per_page=100"
+
+    while next_page_url
+      response = Crest.get(next_page_url, json: true)
+      releases.concat(JSON.parse(response.body).as_a.map { |release| release["name"].as_s })
+
+      next_page_url = extract_next_github_page_url(response.headers["Link"]?)
+    end
+
+    releases.reverse
+  end
+
+  private def self.extract_next_github_page_url(link_header : (Array(String) | String | Nil)) : String?
+    return nil unless link_header
+
+    link_header = link_header.is_a?(Array) ? 
link_header.join(",") : link_header + + links = link_header.split(GITHUB_DELIM_LINKS, remove_empty: true) + + links.each do |link| + captures = GITHUB_LINK_REGEX.match(link.strip).try &.named_captures + return captures["link"] if captures && captures["rel"]? == "next" + end + + nil + end +end diff --git a/src/kubernetes/installer.cr b/src/kubernetes/installer.cr new file mode 100644 index 00000000..53241ad4 --- /dev/null +++ b/src/kubernetes/installer.cr @@ -0,0 +1,391 @@ +require "crinja" +require "base64" +require "file_utils" + +require "../util" +require "../util/ssh" +require "../util/shell" +require "../kubernetes/util" +require "../hetzner/instance" +require "../hetzner/load_balancer" +require "../configuration/loader" +require "./software/system_upgrade_controller" +require "./software/cilium" +require "./software/hetzner/secret" +require "./software/hetzner/cloud_controller_manager" +require "./software/hetzner/csi_driver" +require "./software/cluster_autoscaler" + +class Kubernetes::Installer + include Util + include Util::Shell + + MASTER_INSTALL_SCRIPT = {{ read_file("#{__DIR__}/../../templates/master_install_script.sh") }} + WORKER_INSTALL_SCRIPT = {{ read_file("#{__DIR__}/../../templates/worker_install_script.sh") }} + CLOUD_INIT_WAIT_SCRIPT = {{ read_file("#{__DIR__}/../../templates/cloud_init_wait_script.sh") }} + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + getter masters : Array(Hetzner::Instance) = [] of Hetzner::Instance + getter workers : Array(Hetzner::Instance) = [] of Hetzner::Instance + getter autoscaling_worker_node_pools : Array(Configuration::NodePool) + getter load_balancer : Hetzner::LoadBalancer? + getter ssh : ::Util::SSH + + private getter first_master : Hetzner::Instance? + + private getter cni : Configuration::NetworkingComponents::CNI { settings.networking.cni } + + def initialize( + @configuration, + # @load_balancer, + @ssh, + @autoscaling_worker_node_pools + ) + end + + def run(masters_installation_queue_channel, workers_installation_queue_channel, completed_channel, master_count, worker_count) + ensure_kubectl_is_installed! 
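+    # High-level flow: bring up the control plane, generate the kubeconfig,
+    # install the core software, then join the workers and finally apply
+    # labels/taints. Each step below blocks until it has completed.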
+ + set_up_control_plane(masters_installation_queue_channel, master_count) + + save_kubeconfig(master_count) + + install_software(master_count) + + set_up_workers(workers_installation_queue_channel, worker_count, master_count) + + add_labels_and_taints_to_masters + add_labels_and_taints_to_workers + + completed_channel.send(nil) + end + + private def set_up_control_plane(masters_installation_queue_channel, master_count) + master_count.times do + masters << masters_installation_queue_channel.receive + end + + masters_ready_channel = Channel(Hetzner::Instance).new + + set_up_first_master(master_count) + + other_masters = masters - [first_master] + + other_masters.each do |master| + spawn do + deploy_k3s_to_master(master, master_count) + masters_ready_channel.send(master) + end + end + + (master_count - 1).times do + masters_ready_channel.receive + end + end + + private def set_up_workers(workers_installation_queue_channel, worker_count, master_count) + workers_ready_channel = Channel(Hetzner::Instance).new + + mutex = Mutex.new + + worker_count.times do + spawn do + worker = workers_installation_queue_channel.receive + mutex.synchronize { workers << worker } + deploy_k3s_to_worker(worker, master_count) + workers_ready_channel.send(worker) + end + end + + worker_count.times do + workers_ready_channel.receive + end + end + + private def set_up_first_master(master_count : Int) + ssh.run(first_master, settings.networking.ssh.port, CLOUD_INIT_WAIT_SCRIPT, settings.networking.ssh.use_agent) + + install_script = master_install_script(first_master, master_count) + + output = ssh.run(first_master, settings.networking.ssh.port, install_script, settings.networking.ssh.use_agent) + + log_line "Waiting for the control plane to be ready...", log_prefix: "Instance #{first_master.name}" + + sleep 10 unless /No change detected/ =~ output + + save_kubeconfig(master_count) + + sleep 5 + + command = "kubectl cluster-info 2> /dev/null" + + Retriable.retry(max_attempts: 3, on: Tasker::Timeout, backoff: false) do + Tasker.timeout(30.seconds) do + loop do + result = run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token, log_prefix: "Control plane", abort_on_error: false, print_output: false) + break if result.output.includes?("running") + sleep 1 + end + end + end + + log_line "...k3s deployed", log_prefix: "Instance #{first_master.name}" + end + + private def deploy_k3s_to_master(master : Hetzner::Instance, master_count) + ssh.run(master, settings.networking.ssh.port, CLOUD_INIT_WAIT_SCRIPT, settings.networking.ssh.use_agent) + + install_script = master_install_script(master, master_count) + ssh.run(master, settings.networking.ssh.port, install_script, settings.networking.ssh.use_agent) + log_line "...k3s deployed", log_prefix: "Instance #{master.name}" + end + + private def deploy_k3s_to_worker(worker : Hetzner::Instance, master_count) + ssh.run(worker, settings.networking.ssh.port, CLOUD_INIT_WAIT_SCRIPT, settings.networking.ssh.use_agent) + + install_script = worker_install_script(master_count) + ssh.run(worker, settings.networking.ssh.port, install_script, settings.networking.ssh.use_agent) + log_line "...k3s has been deployed to worker #{worker.name}.", log_prefix: "Instance #{worker.name}" + end + + private def master_install_script(master, master_count) + server = "" + datastore_endpoint = "" + etcd_arguments = "" + + if settings.datastore.mode == "etcd" + server = master == first_master ? 
" --cluster-init " : " --server https://#{api_server_ip_address}:6443 " + etcd_arguments = " --etcd-expose-metrics=true " + else + datastore_endpoint = " K3S_DATASTORE_ENDPOINT='#{settings.datastore.external_datastore_endpoint}' " + end + + extra_args = "#{kube_api_server_args_list} #{kube_scheduler_args_list} #{kube_controller_manager_args_list} #{kube_cloud_controller_manager_args_list} #{kubelet_args_list} #{kube_proxy_args_list}" + taint = settings.schedule_workloads_on_masters ? " " : " --node-taint CriticalAddonsOnly=true:NoExecute " + + Crinja.render(MASTER_INSTALL_SCRIPT, { + cluster_name: settings.cluster_name, + k3s_version: settings.k3s_version, + k3s_token: k3s_token, + cni: cni.enabled.to_s, + cni_mode: cni.mode, + flannel_backend: flannel_backend, + taint: taint, + extra_args: extra_args, + server: server, + tls_sans: generate_tls_sans(master_count), + private_network_enabled: settings.networking.private_network.enabled.to_s, + private_network_test_ip: settings.networking.private_network.subnet.split(".")[0..2].join(".") + ".0", + private_network_subnet: settings.networking.private_network.enabled ? settings.networking.private_network.subnet : "", + cluster_cidr: settings.networking.cluster_cidr, + service_cidr: settings.networking.service_cidr, + cluster_dns: settings.networking.cluster_dns, + datastore_endpoint: datastore_endpoint, + etcd_arguments: etcd_arguments, + embedded_registry_mirror_enabled: settings.embedded_registry_mirror.enabled.to_s, + }) + end + + private def worker_install_script(master_count) + Crinja.render(WORKER_INSTALL_SCRIPT, { + cluster_name: settings.cluster_name, + k3s_token: k3s_token, + k3s_version: settings.k3s_version, + api_server_ip_address: api_server_ip_address, + private_network_enabled: settings.networking.private_network.enabled.to_s, + private_network_test_ip: settings.networking.private_network.subnet.split(".")[0..2].join(".") + ".0", + private_network_subnet: settings.networking.private_network.enabled ? settings.networking.private_network.subnet : "", + extra_args: kubelet_args_list + }) + end + + private def flannel_backend + if cni.flannel? && cni.encryption? + available_releases = K3s.available_releases + selected_k3s_index = available_releases.index(settings.k3s_version).not_nil! + k3s_1_23_6_index = available_releases.index("v1.23.6+k3s1").not_nil! + + selected_k3s_index >= k3s_1_23_6_index ? " --flannel-backend=wireguard-native " : " --flannel-backend=wireguard " + elsif cni.flannel? + " " + else + args = [ + "--flannel-backend=none", + "--disable-network-policy" + ] + + args << "--disable-kube-proxy" unless cni.kube_proxy? 
+ args.join(" ") + end + end + + private def kube_api_server_args_list + kubernetes_component_args_list("kube-apiserver", settings.kube_api_server_args) + end + + private def kube_scheduler_args_list + kubernetes_component_args_list("kube-scheduler", settings.kube_scheduler_args) + end + + private def kube_controller_manager_args_list + kubernetes_component_args_list("kube-controller-manager", settings.kube_controller_manager_args) + end + + private def kube_cloud_controller_manager_args_list + kubernetes_component_args_list("kube-cloud-controller-manager", settings.kube_cloud_controller_manager_args) + end + + private def kubelet_args_list + kubernetes_component_args_list("kubelet", settings.all_kubelet_args) + end + + private def kube_proxy_args_list + kubernetes_component_args_list("kube-proxy", settings.kube_proxy_args) + end + + private def k3s_token : String + @k3s_token ||= begin + tokens = masters.map do |master| + token_by_master(master) + end.reject(&.empty?) + + if tokens.empty? + Random::Secure.hex + else + tokens = tokens.tally + max_counts = tokens.max_of { |_, count| count } + token = tokens.key_for(max_counts) + token.empty? ? Random::Secure.hex : token.split(':').last + end + end + end + + private def first_master : Hetzner::Instance + @first_master ||= begin + return masters[0] if k3s_token.empty? + + bootstrapped_master = masters.sort_by(&.name).find do |master| + token_by_master(master) == k3s_token + end + + bootstrapped_master || masters[0] + end + end + + private def token_by_master(master : Hetzner::Instance) + ssh.run(master, settings.networking.ssh.port, "cat /var/lib/rancher/k3s/server/node-token", settings.networking.ssh.use_agent, print_output: false).split(':').last + rescue + "" + end + + private def save_kubeconfig(master_count) + kubeconfig_path = configuration.kubeconfig_path + + log_line "Generating the kubeconfig file to #{kubeconfig_path}...", "Control plane" + + kubeconfig = ssh.run(first_master, settings.networking.ssh.port, "cat /etc/rancher/k3s/k3s.yaml", settings.networking.ssh.use_agent, print_output: false). + gsub("default", settings.cluster_name) + + File.write(kubeconfig_path, kubeconfig) + + masters.each_with_index do |master, index| + master_ip_address = settings.networking.public_network.ipv4 ? 
master.public_ip_address : master.private_ip_address + master_kubeconfig_path = "#{kubeconfig_path}-#{master.name}" + master_kubeconfig = kubeconfig + .gsub("server: https://127.0.0.1:6443", "server: https://#{master_ip_address}:6443") + .gsub("name: #{settings.cluster_name}", "name: #{master.name}") + .gsub("cluster: #{settings.cluster_name}", "cluster: #{master.name}") + .gsub("user: #{settings.cluster_name}", "user: #{master.name}") + .gsub("current-context: #{settings.cluster_name}", "current-context: #{master.name}") + + File.write(master_kubeconfig_path, master_kubeconfig) + end + + paths = masters.map { |master| "#{kubeconfig_path}-#{master.name}" }.join(":") + + system("KUBECONFIG=#{paths} kubectl config view --flatten > #{kubeconfig_path}") + system("KUBECONFIG=#{kubeconfig_path} kubectl config use-context #{first_master.name}") + + masters.each do |master| + FileUtils.rm("#{kubeconfig_path}-#{master.name}") + end + + File.chmod kubeconfig_path, 0o600 + + log_line "...kubeconfig file generated as #{kubeconfig_path}.", "Control plane" + end + + private def add_labels_and_taints_to_masters + add_labels_or_taints(:label, masters, settings.masters_pool.labels, "masters_pool") + add_labels_or_taints(:taint, masters, settings.masters_pool.taints, "masters_pool") + end + + private def add_labels_and_taints_to_workers + settings.worker_node_pools.each do |node_pool| + instance_type = node_pool.instance_type + node_name_prefix = /#{settings.cluster_name}-pool-#{node_pool.name}-worker/ + + nodes = workers.select { |worker| node_name_prefix =~ worker.name } + + add_labels_or_taints(:label, nodes, node_pool.labels, node_pool.name) + add_labels_or_taints(:taint, nodes, node_pool.taints, node_pool.name) + end + end + + private def add_labels_or_taints(mark_type, instances, marks, node_pool_name) + return unless marks.any? + + node_names = instances.map(&.name).join(" ") + + log_line "\nAdding #{mark_type}s to #{node_pool_name} pool workers...", log_prefix: "Node labels" + + all_marks = marks.map do |mark| + "#{mark.key}=#{mark.value}" + end.join(" ") + + command = "kubectl #{mark_type} --overwrite nodes #{node_names} #{all_marks}" + + run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token, log_prefix: "Node labels") + + log_line "...node labels applied", log_prefix: "Node labels" + end + + private def generate_tls_sans(master_count) + sans = [ + "--tls-san=#{api_server_ip_address}", + "--tls-san=127.0.0.1" + ] + sans << "--tls-san=#{settings.api_server_hostname}" if settings.api_server_hostname + + masters.each do |master| + master_private_ip = master.private_ip_address + master_public_ip = master.public_ip_address + sans << "--tls-san=#{master_private_ip}" + sans << "--tls-san=#{master_public_ip}" + end + + sans.uniq.sort.join(" ") + end + + private def install_software(master_count) + Kubernetes::Software::Cilium.new(configuration, settings).install if settings.networking.cni.cilium? 
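+    # The hcloud token secret is created before the cloud controller manager
+    # and CSI driver below, since their manifests reference it.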
+ Kubernetes::Software::Hetzner::Secret.new(configuration, settings).create + Kubernetes::Software::Hetzner::CloudControllerManager.new(configuration, settings).install + Kubernetes::Software::Hetzner::CSIDriver.new(configuration, settings).install + Kubernetes::Software::SystemUpgradeController.new(configuration, settings).install + Kubernetes::Software::ClusterAutoscaler.new(configuration, settings, first_master, ssh, autoscaling_worker_node_pools, worker_install_script(master_count)).install + end + + private def default_log_prefix + "Kubernetes software" + end + + private def api_server_ip_address + if first_master.private_ip_address.nil? + first_master.public_ip_address + else + first_master.private_ip_address + end + end +end diff --git a/src/kubernetes/resources/deployment.cr b/src/kubernetes/resources/deployment.cr new file mode 100644 index 00000000..730a04be --- /dev/null +++ b/src/kubernetes/resources/deployment.cr @@ -0,0 +1,10 @@ +require "./deployment/spec" + +module Kubernetes::Resources + class Deployment + include YAML::Serializable + include YAML::Serializable::Unmapped + + property spec : Kubernetes::Resources::Deployment::Spec + end +end diff --git a/src/kubernetes/resources/deployment/spec.cr b/src/kubernetes/resources/deployment/spec.cr new file mode 100644 index 00000000..6ae160ed --- /dev/null +++ b/src/kubernetes/resources/deployment/spec.cr @@ -0,0 +1,12 @@ +require "./spec/template" + +module Kubernetes::Resources + class Deployment + class Spec + include YAML::Serializable + include YAML::Serializable::Unmapped + + property template : Kubernetes::Resources::Deployment::Spec::Template + end + end +end diff --git a/src/kubernetes/resources/deployment/spec/template.cr b/src/kubernetes/resources/deployment/spec/template.cr new file mode 100644 index 00000000..3876febb --- /dev/null +++ b/src/kubernetes/resources/deployment/spec/template.cr @@ -0,0 +1,14 @@ +require "../../pod/spec" + +module Kubernetes::Resources + class Deployment + class Spec + class Template + include YAML::Serializable + include YAML::Serializable::Unmapped + + property spec : Kubernetes::Resources::Pod::Spec + end + end + end +end diff --git a/src/kubernetes/resources/pod.cr b/src/kubernetes/resources/pod.cr new file mode 100644 index 00000000..b9bc7eda --- /dev/null +++ b/src/kubernetes/resources/pod.cr @@ -0,0 +1,10 @@ +require "./pod/spec" + +module Kubernetes::Resources + class Pod + include YAML::Serializable + include YAML::Serializable::Unmapped + + property spec : Kubernetes::Resources::Pod::Spec + end +end diff --git a/src/kubernetes/resources/pod/spec.cr b/src/kubernetes/resources/pod/spec.cr new file mode 100644 index 00000000..0a10d9f5 --- /dev/null +++ b/src/kubernetes/resources/pod/spec.cr @@ -0,0 +1,26 @@ +require "./spec/toleration" +require "./spec/container" +require "./spec/volume" + +module Kubernetes::Resources + class Pod + class Spec + include YAML::Serializable + include YAML::Serializable::Unmapped + + property tolerations : Array(Kubernetes::Resources::Pod::Spec::Toleration)? + property containers : Array(Kubernetes::Resources::Pod::Spec::Container)? + property volumes : Array(Kubernetes::Resources::Pod::Spec::Volume)? 
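+      # Helper used when patching third-party manifests; it appends to an
+      # existing tolerations array or creates one, e.g.:
+      #   spec.add_toleration(key: "CriticalAddonsOnly", value: "true", effect: "NoExecute")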
+ + def add_toleration(key, value, effect) + toleration = Kubernetes::Resources::Pod::Spec::Toleration.new(key: key, value: value, effect: effect) + + if tolerations = self.tolerations + tolerations << toleration + else + self.tolerations = [toleration] + end + end + end + end +end diff --git a/src/kubernetes/resources/pod/spec/container.cr b/src/kubernetes/resources/pod/spec/container.cr new file mode 100644 index 00000000..882596a3 --- /dev/null +++ b/src/kubernetes/resources/pod/spec/container.cr @@ -0,0 +1,17 @@ +require "./container/env_variable" +require "./container/volume_mount" + +class Kubernetes::Resources::Pod + class Spec + class Container + include YAML::Serializable + include YAML::Serializable::Unmapped + + property name : String? + property image : String? + property command : Array(String)? + property env : Array(EnvVariable)? + property volumeMounts : Array(VolumeMount)? + end + end +end diff --git a/src/kubernetes/resources/pod/spec/container/env_variable.cr b/src/kubernetes/resources/pod/spec/container/env_variable.cr new file mode 100644 index 00000000..91ee9262 --- /dev/null +++ b/src/kubernetes/resources/pod/spec/container/env_variable.cr @@ -0,0 +1,16 @@ +class Kubernetes::Resources::Pod + class Spec + class Container + class EnvVariable + include YAML::Serializable + include YAML::Serializable::Unmapped + + property name : String? + property value : String? + + def initialize(@name, @value) + end + end + end + end +end diff --git a/src/kubernetes/resources/pod/spec/container/volume_mount.cr b/src/kubernetes/resources/pod/spec/container/volume_mount.cr new file mode 100644 index 00000000..14765cb9 --- /dev/null +++ b/src/kubernetes/resources/pod/spec/container/volume_mount.cr @@ -0,0 +1,14 @@ +class Kubernetes::Resources::Pod + class Spec + class Container + class VolumeMount + include YAML::Serializable + include YAML::Serializable::Unmapped + + property name : String? + property mountPath : String? + property readOnly : Bool? + end + end + end +end diff --git a/src/kubernetes/resources/pod/spec/toleration.cr b/src/kubernetes/resources/pod/spec/toleration.cr new file mode 100644 index 00000000..c092d4e0 --- /dev/null +++ b/src/kubernetes/resources/pod/spec/toleration.cr @@ -0,0 +1,15 @@ +class Kubernetes::Resources::Pod + class Spec + class Toleration + include YAML::Serializable + include YAML::Serializable::Unmapped + + property effect : String? + property key : String? + property value : String? + + def initialize(@effect, @key, @value) + end + end + end +end diff --git a/src/kubernetes/resources/pod/spec/volume.cr b/src/kubernetes/resources/pod/spec/volume.cr new file mode 100644 index 00000000..1a9385db --- /dev/null +++ b/src/kubernetes/resources/pod/spec/volume.cr @@ -0,0 +1,13 @@ +require "./volume/host_path" + +class Kubernetes::Resources::Pod + class Spec + class Volume + include YAML::Serializable + include YAML::Serializable::Unmapped + + property name : String? + property hostPath : HostPath? + end + end +end diff --git a/src/kubernetes/resources/pod/spec/volume/host_path.cr b/src/kubernetes/resources/pod/spec/volume/host_path.cr new file mode 100644 index 00000000..394fc6f1 --- /dev/null +++ b/src/kubernetes/resources/pod/spec/volume/host_path.cr @@ -0,0 +1,12 @@ +class Kubernetes::Resources::Pod + class Spec + class Volume + class HostPath + include YAML::Serializable + include YAML::Serializable::Unmapped + + property path : String? 
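+        # Only `path` is modelled here; any other hostPath fields are
+        # preserved by YAML::Serializable::Unmapped when round-tripping.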
+ end + end + end +end diff --git a/src/kubernetes/resources/resource.cr b/src/kubernetes/resources/resource.cr new file mode 100644 index 00000000..1d385471 --- /dev/null +++ b/src/kubernetes/resources/resource.cr @@ -0,0 +1,8 @@ +module Kubernetes::Resources + class Resource + include YAML::Serializable + include YAML::Serializable::Unmapped + + property kind : String + end +end diff --git a/src/kubernetes/software/cilium.cr b/src/kubernetes/software/cilium.cr new file mode 100644 index 00000000..bbfb279d --- /dev/null +++ b/src/kubernetes/software/cilium.cr @@ -0,0 +1,55 @@ +require "../../configuration/loader" +require "../../configuration/main" +require "../../util" +require "../../util/shell" + +class Kubernetes::Software::Cilium + include Util + include Util::Shell + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + + def initialize(@configuration, @settings) + end + + def install + log_line "Installing Cilium..." + + command = <<-BASH + helm repo add cilium https://helm.cilium.io/ + + helm upgrade --install \ + --version #{settings.networking.cni.cilium.chart_version} \ + --namespace kube-system \ + --set encryption.enabled=#{settings.networking.cni.encryption.to_s} \ + --set encryption.type=wireguard \ + --set encryption.nodeEncryption=#{settings.networking.cni.encryption.to_s} \ + --set routingMode=tunnel \ + --set tunnelProtocol=vxlan \ + --set ipam.mode="kubernetes" \ + --set kubeProxyReplacement=true \ + --set hubble.enabled=true \ + --set hubble.metrics.enabled="{dns,drop,tcp,flow,port-distribution,icmp,http}" \ + --set hubble.relay.enabled=true \ + --set hubble.ui.enabled=true \ + --set k8sServiceHost=127.0.0.1 \ + --set k8sServicePort=6444 \ + --set operator.replicas=1 \ + --set operator.resources.requests.memory=128Mi \ + --set resources.requests.memory=512Mi \ + cilium cilium/cilium + + echo "Waiting for Cilium to be ready..." + kubectl -n kube-system rollout status ds cilium + BASH + + run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token) + + log_line "...Cilium installed" + end + + private def default_log_prefix + "CNI" + end +end diff --git a/src/kubernetes/software/cluster_autoscaler.cr b/src/kubernetes/software/cluster_autoscaler.cr new file mode 100644 index 00000000..5a45d967 --- /dev/null +++ b/src/kubernetes/software/cluster_autoscaler.cr @@ -0,0 +1,163 @@ +require "../../configuration/loader" +require "../../configuration/main" +require "../../hetzner/instance" +require "../../hetzner/instance/create" +require "../../util" +require "../../util/shell" +require "../../util/ssh" +require "../resources/resource" +require "../resources/deployment" +require "../resources/pod/spec/toleration" +require "../resources/pod/spec/container" +require "../../util" +require "../util" + +class Kubernetes::Software::ClusterAutoscaler + include Util + include Kubernetes::Util + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + getter autoscaling_worker_node_pools : Array(Configuration::NodePool) + getter worker_install_script : String + getter first_master : ::Hetzner::Instance + getter ssh : ::Util::SSH + + def initialize(@configuration, @settings, @first_master, @ssh, @autoscaling_worker_node_pools, @worker_install_script) + end + + def install + log_line "Installing Cluster Autoscaler..." 
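+    # `manifest` below fetches the upstream YAML, patches it in memory
+    # (image, command, env, tolerations, CA certificate mount) and returns
+    # the result, so a single kubectl apply installs the patched resources.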
+ + apply_manifest_from_yaml(manifest) + + log_line "...Cluster Autoscaler installed" + end + + private def cloud_init + ::Hetzner::Instance::Create.cloud_init(settings, settings.networking.ssh.port, settings.snapshot_os, settings.additional_packages, settings.post_create_commands, [k3s_join_script]) + end + + private def k3s_join_script + "|\n #{worker_install_script.gsub("\n", "\n ")}" + end + + private def certificate_path + @certificate_path ||= if ssh.run(first_master, settings.networking.ssh.port, "[ -f /etc/ssl/certs/ca-certificates.crt ] && echo 1 || echo 2", settings.networking.ssh.use_agent, false).chomp == "1" + "/etc/ssl/certs/ca-certificates.crt" + else + "/etc/ssl/certs/ca-bundle.crt" + end + end + + private def node_pool_args + autoscaling_worker_node_pools.map do |pool| + autoscaling = pool.autoscaling.not_nil! + "--nodes=#{autoscaling.min_instances}:#{autoscaling.max_instances}:#{pool.instance_type.upcase}:#{pool.location.upcase}:#{pool.name}" + end + end + + private def patch_resources(resources) + resources.map do |resource| + resource = Kubernetes::Resources::Resource.from_yaml(resource.to_yaml) + + if resource.kind == "Deployment" + patched_deployment(resource) + else + resource + end + end + end + + private def patched_deployment(resource) + deployment = Kubernetes::Resources::Deployment.from_yaml(resource.to_yaml) + + patch_tolerations(deployment.spec.template.spec) + patch_containers(deployment.spec.template.spec.containers) + patch_volumes(deployment.spec.template.spec.volumes) + + deployment + end + + private def patch_tolerations(pod_spec) + pod_spec.add_toleration(key: "CriticalAddonsOnly", value: "true", effect: "NoExecute") + end + + private def container_command + command = [ + "./cluster-autoscaler", + "--cloud-provider=hetzner", + "--enforce-node-group-min-size", + ] + + command += node_pool_args + end + + private def patch_autoscaler_container(autoscaler_container) + autoscaler_container.image = "docker.io/hetznercloud/cluster-autoscaler:v1.31.0-hcloud1" + autoscaler_container.command = container_command + + set_container_environment_variable(autoscaler_container, "HCLOUD_CLOUD_INIT", Base64.strict_encode(cloud_init)) + set_container_environment_variable(autoscaler_container, "HCLOUD_IMAGE", settings.autoscaling_image || settings.image) + set_container_environment_variable(autoscaler_container, "HCLOUD_FIREWALL", settings.cluster_name) + set_container_environment_variable(autoscaler_container, "HCLOUD_SSH_KEY", settings.cluster_name) + set_container_environment_variable(autoscaler_container, "HCLOUD_NETWORK", (settings.networking.private_network.existing_network_name.blank? ? settings.cluster_name : settings.networking.private_network.existing_network_name)) + set_container_environment_variable(autoscaler_container, "HCLOUD_PUBLIC_IPV4", settings.networking.public_network.ipv4.to_s) + set_container_environment_variable(autoscaler_container, "HCLOUD_PUBLIC_IPV6", settings.networking.public_network.ipv6.to_s) + + set_certificate_path(autoscaler_container) + end + + private def set_container_environment_variable(autoscaler_container, variable_name, variable_value) + env_variables = autoscaler_container.env + + return if env_variables.nil? 
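+    # Update-or-append semantics: mutate the variable if the upstream
+    # manifest already defines it, otherwise add a new entry.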
+ + if variable = env_variables.find { |env| env.name == variable_name } + variable.value = variable_value + else + env_variables << Kubernetes::Resources::Pod::Spec::Container::EnvVariable.new(name: variable_name, value: variable_value) + end + end + + private def set_certificate_path(autoscaler_container) + volume_mounts = autoscaler_container.volumeMounts + + return unless volume_mounts + + if volume_mount = volume_mounts.find { |volume_mount| volume_mount.name == "ssl-certs" } + volume_mount.mountPath = certificate_path + end + end + + private def patch_containers(containers) + return unless containers + + if autoscaler_container = containers.find { |container| container.name == "cluster-autoscaler" } + patch_autoscaler_container(autoscaler_container) + end + end + + private def patch_volumes(volumes) + return unless volumes + + certificate_volume = volumes.find { |volume| volume.name == "ssl-certs" } + + return unless certificate_volume + + if host_path = certificate_volume.hostPath + host_path.path = certificate_path + end + end + + private def manifest + manifest = fetch_manifest(settings.manifests.cluster_autoscaler_manifest_url) + resources = YAML.parse_all(manifest) + patched_resources = patch_resources(resources) + patched_resources.map(&.to_yaml).join + end + + private def default_log_prefix + "Cluster Autoscaler" + end +end diff --git a/src/kubernetes/software/hetzner/cloud_controller_manager.cr b/src/kubernetes/software/hetzner/cloud_controller_manager.cr new file mode 100644 index 00000000..1f7b3cc4 --- /dev/null +++ b/src/kubernetes/software/hetzner/cloud_controller_manager.cr @@ -0,0 +1,36 @@ +require "../../../util" +require "../../util" + +class Kubernetes::Software::Hetzner::CloudControllerManager + include Util + include Kubernetes::Util + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + + def initialize(@configuration, @settings) + end + + def install + log_line "Installing Hetzner Cloud Controller Manager..." + + apply_manifest_from_yaml(manifest) + + log_line "Hetzner Cloud Controller Manager installed" + end + + private def default_log_prefix + "Hetzner Cloud Controller" + end + + private def manifest + manifest_url = if settings.networking.private_network.enabled + settings.manifests.cloud_controller_manager_manifest_url + else + settings.manifests.cloud_controller_manager_manifest_url.gsub("-networks", "") + end + + manifest = fetch_manifest(manifest_url) + manifest.gsub(/--cluster-cidr=[^"]+/, "--cluster-cidr=#{settings.networking.cluster_cidr}") + end +end diff --git a/src/kubernetes/software/hetzner/csi_driver.cr b/src/kubernetes/software/hetzner/csi_driver.cr new file mode 100644 index 00000000..15560c54 --- /dev/null +++ b/src/kubernetes/software/hetzner/csi_driver.cr @@ -0,0 +1,25 @@ +require "../../../util" +require "../../util" + +class Kubernetes::Software::Hetzner::CSIDriver + include Util + include Kubernetes::Util + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + + def initialize(@configuration, @settings) + end + + def install + log_line "Installing Hetzner CSI Driver..." 
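+    # The CSI manifest is applied unmodified, straight from the configured
+    # URL; unlike the autoscaler manifest, there is nothing to patch here.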
+ + apply_manifest_from_url(settings.manifests.csi_driver_manifest_url) + + log_line "Hetzner CSI Driver installed" + end + + private def default_log_prefix + "Hetzner CSI Driver" + end +end diff --git a/src/kubernetes/software/hetzner/secret.cr b/src/kubernetes/software/hetzner/secret.cr new file mode 100644 index 00000000..038dbd1e --- /dev/null +++ b/src/kubernetes/software/hetzner/secret.cr @@ -0,0 +1,39 @@ +require "../../../util" +require "../../util" + +class Kubernetes::Software::Hetzner::Secret + include Util + include Kubernetes::Util + + HETZNER_CLOUD_SECRET_MANIFEST = {{ read_file("#{__DIR__}/../../../../templates/hetzner_cloud_secret_manifest.yaml") }} + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + + def initialize(@configuration, @settings) + end + + def create + log_line "Creating secret for Hetzner Cloud token..." + + network_name = if settings.networking.private_network.enabled + existing_network_name = settings.networking.private_network.existing_network_name + existing_network_name.empty? ? settings.cluster_name : existing_network_name + else + "" + end + + secret_manifest = Crinja.render(HETZNER_CLOUD_SECRET_MANIFEST, { + network: network_name, + token: settings.hetzner_token + }) + + apply_manifest_from_yaml(secret_manifest) + + log_line "...secret created" + end + + private def default_log_prefix + "Hetzner Cloud Secret" + end +end diff --git a/src/kubernetes/software/system_upgrade_controller.cr b/src/kubernetes/software/system_upgrade_controller.cr new file mode 100644 index 00000000..a8179cd6 --- /dev/null +++ b/src/kubernetes/software/system_upgrade_controller.cr @@ -0,0 +1,70 @@ +require "../resources/resource" +require "../resources/deployment" +require "../resources/pod/spec/toleration" +require "../../configuration/loader" +require "../../configuration/main" +require "../../util" +require "../util" + +class Kubernetes::Software::SystemUpgradeController + include Util + include Kubernetes::Util + + getter configuration : Configuration::Loader + getter settings : Configuration::Main { configuration.settings } + + def initialize(@configuration, @settings) + end + + def install + log_line "Installing System Upgrade Controller..." 
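+    # Three steps: namespace, CRDs, then the controller resources with a
+    # toleration added so the Deployment can run on tainted masters.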
+ + create_namespace + create_crd + create_resources + + log_line "...System Upgrade Controller installed" + end + + private def create_namespace + command = "kubectl create ns system-upgrade --dry-run=client -o yaml | kubectl apply -f -" + apply_kubectl_command(command, error_message = "Failed to install System Upgrade Controller") + end + + private def create_crd + apply_manifest_from_url(settings.manifests.system_upgrade_controller_crd_manifest_url) + end + + private def create_resources + manifest = fetch_manifest(settings.manifests.system_upgrade_controller_deployment_manifest_url) + resources = YAML.parse_all(manifest) + patched_resources = patch_resources(resources) + patched_manifest = patched_resources.map(&.to_yaml).join + + apply_manifest_from_yaml(patched_manifest) + end + + private def deployment_with_added_toleration(resource) + deployment = Kubernetes::Resources::Deployment.from_yaml(resource.to_yaml) + + deployment.spec.template.spec.add_toleration(key: "CriticalAddonsOnly", value: "true", effect: "NoExecute") + + deployment + end + + private def patch_resources(resources) + resources.map do |resource| + resource = Kubernetes::Resources::Resource.from_yaml(resource.to_yaml) + + if resource.kind == "Deployment" + deployment_with_added_toleration(resource) + else + resource + end + end + end + + private def default_log_prefix + "System Upgrade Controller" + end +end diff --git a/src/kubernetes/util.cr b/src/kubernetes/util.cr new file mode 100644 index 00000000..4ac020b9 --- /dev/null +++ b/src/kubernetes/util.cr @@ -0,0 +1,86 @@ +require "../util" +require "../util/shell" +require "socket" + +module Kubernetes::Util + include ::Util + include ::Util::Shell + + def ensure_kubectl_is_installed! + return if which("kubectl") + + log_line "Please ensure kubectl is installed and in your PATH.", log_prefix: "Tooling" + exit 1 + end + + def apply_manifest_from_yaml(yaml) + command = <<-BASH + kubectl apply -f - <<-EOF + #{yaml} + EOF + BASH + + result = run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token) + + unless result.success? + log_line "Failed to apply manifest: #{result.output}" + exit 1 + end + end + + def apply_manifest_from_url(url) + command = "kubectl apply -f #{url}" + + result = run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token) + + unless result.success? + log_line "Failed to apply manifest: #{result.output}" + exit 1 + end + end + + def apply_kubectl_command(command, error_message = "") + result = run_shell_command(command, configuration.kubeconfig_path, settings.hetzner_token) + + unless result.success? + log_line "#{error_message}: #{result.output}" + exit 1 + end + end + + def fetch_manifest(url) + response = Crest.get(url) + + unless response.success? 
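+      # Abort early: callers feed this manifest straight into kubectl apply,
+      # so there is nothing sensible to do without it.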
+ log_line "Failed to fetch manifest from #{url}: Server responded with status #{response.status_code}" + exit 1 + end + + response.body.to_s + end + + def kubernetes_component_args_list(settings_group, setting) + setting.map { |arg| " --#{settings_group}-arg \"#{arg}\" " }.join + end + + def port_open?(ip, port, timeout = 1.0) + begin + socket = TCPSocket.new(ip, port, connect_timeout: timeout) + socket.close + true + rescue Socket::Error | IO::TimeoutError + false + end + end + + def api_server_ready?(kubeconfig_path) + return false unless File.exists?(kubeconfig_path) + + kubeconfig = YAML.parse(File.read(kubeconfig_path)) + server = kubeconfig["clusters"][0]["cluster"]["server"].as_s + ip_address = server.split(":")[1].gsub("//", "") + port = server.split(":")[2] + + port_open?(ip_address, port, timeout = 1.0) + end +end diff --git a/src/util.cr b/src/util.cr new file mode 100644 index 00000000..7f08395c --- /dev/null +++ b/src/util.cr @@ -0,0 +1,22 @@ +module Util + def which(command) + exts = ENV.fetch("PATHEXT", "").split(";") + paths = ENV["PATH"]?.try(&.split(Process::PATH_DELIMITER)) || [] of String + + paths.each do |path| + exts.each do |ext| + exe = File.join(path, "#{command}#{ext}") + return exe if File.executable?(exe) && !File.directory?(exe) + end + end + + nil + end + + def log_line(line, log_prefix = "") + log_prefix = log_prefix.blank? ? default_log_prefix : log_prefix + puts "[#{log_prefix}] #{line}" + end + + abstract def default_log_prefix +end diff --git a/src/util/prefixed_io.cr b/src/util/prefixed_io.cr new file mode 100644 index 00000000..78142faf --- /dev/null +++ b/src/util/prefixed_io.cr @@ -0,0 +1,15 @@ +class PrefixedIO < IO + def initialize(@prefix : String, @io : IO); end + + def read(slice : Bytes) + raise NotImplementedError.new "#read" + end + + def write(slice : Bytes) : Nil + content = String.new(slice) + lines = content.lines + lines.each do |line| + @io << @prefix << "#{line}\n" + end + end +end diff --git a/src/util/shell.cr b/src/util/shell.cr new file mode 100644 index 00000000..3e050db7 --- /dev/null +++ b/src/util/shell.cr @@ -0,0 +1,58 @@ +require "./shell/command_result" +require "random/secure" + +module Util + module Shell + def run_shell_command(command : String, kubeconfig_path : String, hetzner_token : String, error_message : String = "", abort_on_error = true, log_prefix = "", print_output : Bool = true) : CommandResult + cmd_file_path = "/tmp/cli_#{Random::Secure.hex(8)}.cmd" + + File.write(cmd_file_path, <<-CONTENT + set -euo pipefail + #{command} + CONTENT + ) + + File.chmod(cmd_file_path, 0o700) + + stdout = IO::Memory.new + stderr = IO::Memory.new + + log_prefix = log_prefix.blank? ? default_log_prefix : log_prefix + + if print_output + all_io_out = if log_prefix.blank? + IO::MultiWriter.new(STDOUT, stdout) + else + IO::MultiWriter.new(PrefixedIO.new("[#{log_prefix}] ", STDOUT), stdout) + end + + all_io_err = IO::MultiWriter.new(STDERR, stderr) + else + all_io_out = stdout + all_io_err = stderr + end + + env = { + "KUBECONFIG" => kubeconfig_path, + "HCLOUD_TOKEN" => hetzner_token + } + + status = Process.run("bash", + args: ["-c", cmd_file_path], + env: env, + output: all_io_out, + error: all_io_err + ) + + output = status.success? ? stdout.to_s : stderr.to_s + result = CommandResult.new(output, status.exit_code) + + unless result.success? 
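+      # abort_on_error (the default) turns any failed command into a hard
+      # exit; polling callers pass abort_on_error: false and inspect the
+      # returned result instead.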
+ log_line "#{error_message}: #{result.output}", log_prefix: log_prefix if print_output + exit 1 if abort_on_error + end + + result + end + end +end diff --git a/src/util/shell/command_result.cr b/src/util/shell/command_result.cr new file mode 100644 index 00000000..152a2ad0 --- /dev/null +++ b/src/util/shell/command_result.cr @@ -0,0 +1,11 @@ +class Util::Shell::CommandResult + getter output : String + getter status : Int32 + + def initialize(@output, @status) + end + + def success? + status.zero? + end +end diff --git a/src/util/ssh.cr b/src/util/ssh.cr new file mode 100644 index 00000000..0823c2b4 --- /dev/null +++ b/src/util/ssh.cr @@ -0,0 +1,78 @@ +require "ssh2" +require "io" +require "../util" +require "retriable" +require "tasker" +require "./prefixed_io" + +class Util::SSH + include ::Util + + getter private_ssh_key_path : String + getter public_ssh_key_path : String + + def initialize(@private_ssh_key_path, @public_ssh_key_path) + end + + def run(instance, port, command, use_ssh_agent, print_output = true) + Retriable.retry(max_attempts: 300, backoff: false, base_interval: 1.second, on: {SSH2::SSH2Error, SSH2::SessionError, Socket::ConnectError}) do + run_command(instance, port, command, use_ssh_agent, print_output) + end + end + + def wait_for_instance(instance, port, use_ssh_agent, test_command, expected_result, max_attempts : Int16 = 20) + result = nil + + loop do + log_line "Waiting for successful ssh connectivity with instance #{instance.name}...", log_prefix: "Instance #{instance.name}" + + sleep 1 + + Retriable.retry(max_attempts: max_attempts, on: Tasker::Timeout, backoff: false) do + Tasker.timeout(5.second) do + result = run(instance, port, test_command, use_ssh_agent, false) + log_line result, log_prefix: "Instance #{instance.name}" if result != expected_result + end + end + + break result if result == expected_result + end + + log_line "...instance #{instance.name} is now up.", log_prefix: "Instance #{instance.name}" + + result + end + + private def run_command(instance, port, command, use_ssh_agent, print_output = true) + host_ip_address = instance.host_ip_address.not_nil! 
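+    # Output is teed: optionally streamed to STDOUT with an instance-name
+    # prefix while always being captured in memory for the return value.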
+ + result = IO::Memory.new + all_output = if print_output + IO::MultiWriter.new(PrefixedIO.new("[Instance #{instance.name}] ", STDOUT), result) + else + IO::MultiWriter.new(result) + end + + SSH2::Session.open(host_ip_address, port) do |session| + session.timeout = 5000 + session.knownhosts.delete_if { |h| h.name == instance.host_ip_address } + + if use_ssh_agent + session.login_with_agent("root") + else + session.login_with_pubkey("root", private_ssh_key_path, public_ssh_key_path) + end + + session.open_session do |channel| + channel.command(command) + IO.copy(channel, all_output) + end + end + + result.to_s.chomp + end + + private def default_log_prefix + "+" + end +end diff --git a/templates/cloud_init.yaml b/templates/cloud_init.yaml new file mode 100644 index 00000000..7b04c89a --- /dev/null +++ b/templates/cloud_init.yaml @@ -0,0 +1,35 @@ +#cloud-config +preserve_hostname: true + +write_files: +{{ eth1_str }} + +{{ growpart_str }} + +- path: /etc/systemd/system/ssh.socket.d/listen.conf + content: | + [Socket] + ListenStream= + ListenStream={{ ssh_port}} + +- path: /etc/configure-ssh.sh + permissions: '0755' + content: | + if systemctl is-active ssh.socket > /dev/null 2>&1 + then + # OpenSSH is using socket activation + systemctl disable ssh + systemctl daemon-reload + systemctl restart ssh.socket + systemctl stop ssh + else + # OpenSSH is not using socket activation + sed -i 's/^#*Port .*/Port {{ ssh_port }}/' /etc/ssh/sshd_config + fi + systemctl restart ssh + + +packages: [{{ packages_str }}] + +runcmd: +{{ post_create_commands_str }} diff --git a/templates/cloud_init_wait_script.sh b/templates/cloud_init_wait_script.sh new file mode 100644 index 00000000..11bbc199 --- /dev/null +++ b/templates/cloud_init_wait_script.sh @@ -0,0 +1,13 @@ +fn_cloud="/var/lib/cloud/instance/boot-finished" +function await_cloud_init { + echo "🕒 Awaiting cloud config (may take a minute...)" + while true; do + for _ in $(seq 1 10); do + test -f $fn_cloud && return + sleep 1 + done + echo -n "." + done +} +test -f $fn_cloud || await_cloud_init +echo "Cloud init finished: $(cat $fn_cloud)" diff --git a/templates/hetzner_cloud_secret_manifest.yaml b/templates/hetzner_cloud_secret_manifest.yaml new file mode 100644 index 00000000..8e421cab --- /dev/null +++ b/templates/hetzner_cloud_secret_manifest.yaml @@ -0,0 +1,8 @@ +apiVersion: "v1" +kind: "Secret" +metadata: + namespace: 'kube-system' + name: 'hcloud' +stringData: + network: "{{ network }}" + token: "{{ token }}" diff --git a/templates/master_install_script.sh b/templates/master_install_script.sh new file mode 100644 index 00000000..25530b96 --- /dev/null +++ b/templates/master_install_script.sh @@ -0,0 +1,74 @@ +touch /etc/initialized + +HOSTNAME=$(hostname -f) +PUBLIC_IP=$(hostname -I | awk '{print $1}') + +if [ "{{ private_network_enabled }}" = "true" ]; then + echo "Using private network " > /var/log/hetzner-k3s.log + SUBNET="{{ private_network_subnet }}" + SUBNET_PREFIX=$(echo $SUBNET | cut -d'/' -f1 | sed 's/\./\\./g' | sed 's/0$//') + MAX_ATTEMPTS=30 + DELAY=10 + UP="false" + + for i in $(seq 1 $MAX_ATTEMPTS); do + if ip -4 addr show | grep -q "inet $SUBNET_PREFIX"; then + echo "Private network IP in subnet $SUBNET is up" 2>&1 | tee -a /var/log/hetzner-k3s.log + UP="true" + break + fi + echo "Waiting for private network IP in subnet $SUBNET to be available... 
(Attempt $i/$MAX_ATTEMPTS)" 2>&1 | tee -a /var/log/hetzner-k3s.log + sleep $DELAY + done + + if [ "$UP" = "false" ]; then + echo "Timeout waiting for private network IP in subnet $SUBNET" 2>&1 | tee -a /var/log/hetzner-k3s.log + fi + + PRIVATE_IP=$(ip route get {{ private_network_test_ip }} | awk -F"src " 'NR==1{split($2,a," ");print a[1]}') + NETWORK_INTERFACE=" --flannel-iface=$(ip route get {{ private_network_test_ip }} | awk -F"dev " 'NR==1{split($2,a," ");print a[1]}') " +else + echo "Using public network " > /var/log/hetzner-k3s.log + PRIVATE_IP="${PUBLIC_IP}" + NETWORK_INTERFACE=" " +fi + +if [ "{{ cni }}" = "true" ] && [ "{{ cni_mode }}" = "flannel" ]; then + FLANNEL_SETTINGS=" {{ flannel_backend }} $NETWORK_INTERFACE " +else + FLANNEL_SETTINGS=" {{ flannel_backend }} " +fi + +if [ "{{ embedded_registry_mirror_enabled }}" = "true" ]; then + EMBEDDED_REGISTRY_MIRROR=" --embedded-registry " +else + EMBEDDED_REGISTRY_MIRROR=" " +fi + +mkdir -p /etc/rancher/k3s + +cat > /etc/rancher/k3s/registries.yaml < /etc/initialized diff --git a/templates/upgrade_plan_for_masters.yaml b/templates/upgrade_plan_for_masters.yaml new file mode 100644 index 00000000..f3898851 --- /dev/null +++ b/templates/upgrade_plan_for_masters.yaml @@ -0,0 +1,22 @@ +apiVersion: upgrade.cattle.io/v1 +kind: Plan +metadata: + name: k3s-server + namespace: system-upgrade + labels: + k3s-upgrade: server +spec: + concurrency: 1 + version: {{ new_k3s_version }} + nodeSelector: + matchExpressions: + - {key: node-role.kubernetes.io/master, operator: In, values: ["true"]} + serviceAccountName: system-upgrade + tolerations: + - key: "CriticalAddonsOnly" + operator: "Equal" + value: "true" + effect: "NoExecute" + cordon: true + upgrade: + image: rancher/k3s-upgrade diff --git a/templates/upgrade_plan_for_workers.yaml b/templates/upgrade_plan_for_workers.yaml new file mode 100644 index 00000000..6af451d1 --- /dev/null +++ b/templates/upgrade_plan_for_workers.yaml @@ -0,0 +1,22 @@ +apiVersion: upgrade.cattle.io/v1 +kind: Plan +metadata: + name: k3s-agent + namespace: system-upgrade + labels: + k3s-upgrade: agent +spec: + concurrency: {{ worker_upgrade_concurrency }} + version: {{ new_k3s_version }} + nodeSelector: + matchExpressions: + - {key: node-role.kubernetes.io/master, operator: NotIn, values: ["true"]} + serviceAccountName: system-upgrade + tolerations: + - {key: '', effect: NoSchedule, operator: Exists, value: ''} + prepare: + image: rancher/k3s-upgrade + args: ["prepare", "k3s-server"] + cordon: true + upgrade: + image: rancher/k3s-upgrade diff --git a/templates/worker_install_script.sh b/templates/worker_install_script.sh new file mode 100644 index 00000000..ebd46661 --- /dev/null +++ b/templates/worker_install_script.sh @@ -0,0 +1,49 @@ +touch /etc/initialized + +HOSTNAME=$(hostname -f) +PUBLIC_IP=$(hostname -I | awk '{print $1}') + +if [ "{{ private_network_enabled }}" = "true" ]; then + echo "Using private network " > /var/log/hetzner-k3s.log + SUBNET="{{ private_network_subnet }}" + SUBNET_PREFIX=$(echo $SUBNET | cut -d'/' -f1 | sed 's/\./\\./g' | sed 's/0$//') + MAX_ATTEMPTS=30 + DELAY=10 + UP="false" + + for i in $(seq 1 $MAX_ATTEMPTS); do + if ip -4 addr show | grep -q "inet $SUBNET_PREFIX"; then + echo "Private network IP in subnet $SUBNET is up" 2>&1 | tee -a /var/log/hetzner-k3s.log + UP="true" + break + fi + echo "Waiting for private network IP in subnet $SUBNET to be available... 
(Attempt $i/$MAX_ATTEMPTS)" 2>&1 | tee -a /var/log/hetzner-k3s.log + sleep $DELAY + done + + if [ "$UP" = "false" ]; then + echo "Timeout waiting for private network IP in subnet $SUBNET" 2>&1 | tee -a /var/log/hetzner-k3s.log + fi + + PRIVATE_IP=$(ip route get {{ private_network_test_ip }} | awk -F"src " 'NR==1{split($2,a," ");print a[1]}') + NETWORK_INTERFACE=" --flannel-iface=$(ip route get {{ private_network_test_ip }} | awk -F"dev " 'NR==1{split($2,a," ");print a[1]}') " +else + echo "Using public network " > /var/log/hetzner-k3s.log + PRIVATE_IP="${PUBLIC_IP}" + NETWORK_INTERFACE=" " +fi + +mkdir -p /etc/rancher/k3s + +cat > /etc/rancher/k3s/registries.yaml < /etc/initialized
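+# /etc/initialized is touched at the top of this script and rewritten at the
+# end; it acts as a completion marker so that re-runs can tell this worker
+# was already set up.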