From c78461e046572b8d61f46c832ce399069bd0b320 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:50:24 -0600 Subject: [PATCH] Add monitoring node --- autoscaling/tf_init/cluster-network.tf | 2 + autoscaling/tf_init/compute-cluster.tf | 1 + autoscaling/tf_init/controller_update.tf | 4 +- autoscaling/tf_init/instance-pool.tf | 2 + autoscaling/tf_init/inventory.tpl | 4 +- conf/variables.tpl | 4 +- controller.tf | 8 +- data.tf | 7 + inventory.tpl | 4 +- locals.tf | 8 + marketplace.tf | 28 +- monitoring.tf | 55 ++++ outputs.tf | 4 + playbooks/destroy.yml | 2 +- playbooks/monitoring.yml | 25 ++ playbooks/new_nodes.yml | 6 +- playbooks/resize_add.yml | 6 +- playbooks/resize_remove.yml | 2 +- playbooks/resize_remove_unreachable.yml | 2 +- .../roles/autoscaling_mon/tasks/ubuntu.yml | 2 +- playbooks/roles/etc-hosts/tasks/common.yml | 20 +- .../templates/etc-hosts-controller.j2 | 4 + playbooks/roles/grafana/tasks/dashboard.yml | 1 + .../roles/metrics-exporter/tasks/main.yml | 8 +- .../tasks/node_exporter_el.yml | 25 +- .../tasks/node_exporter_ubuntu.yml | 9 - .../roles/prometheus/tasks/gather_info.yml | 18 +- playbooks/roles/prometheus/tasks/main.yml | 30 +- .../roles/slurm/tasks/compute-rack-aware.yml | 8 +- playbooks/roles/slurm/tasks/compute.yml | 2 +- .../roles/slurm/tasks/destroy-rack-aware.yml | 4 +- playbooks/roles/slurm/tasks/destroy.yml | 2 +- playbooks/roles/slurm/tasks/el7.yml | 4 +- playbooks/roles/slurm/tasks/el8.yml | 4 +- playbooks/roles/slurm/tasks/ubuntu.yml | 4 +- playbooks/site.yml | 31 +- playbooks/slurm_config.yml | 2 +- schema.yaml | 267 +++++++++++++++++- slurm_ha.tf | 8 +- variables.tf | 34 ++- 40 files changed, 572 insertions(+), 89 deletions(-) create mode 100644 monitoring.tf create mode 100644 playbooks/monitoring.yml diff --git a/autoscaling/tf_init/cluster-network.tf b/autoscaling/tf_init/cluster-network.tf index d7b0b4f0..c3218b1f 100755 --- a/autoscaling/tf_init/cluster-network.tf +++ b/autoscaling/tf_init/cluster-network.tf @@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" { } freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configuration { availability_domain = var.ad diff --git a/autoscaling/tf_init/compute-cluster.tf b/autoscaling/tf_init/compute-cluster.tf index ef9067b8..1b5b7dfa 100755 --- a/autoscaling/tf_init/compute-cluster.tf +++ b/autoscaling/tf_init/compute-cluster.tf @@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" { #Optional display_name = local.cluster_name freeform_tags = { + "user" = var.tags "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } diff --git a/autoscaling/tf_init/controller_update.tf b/autoscaling/tf_init/controller_update.tf index 86ddf9e5..1f4d36db 100755 --- a/autoscaling/tf_init/controller_update.tf +++ b/autoscaling/tf_init/controller_update.tf @@ -24,6 +24,8 @@ resource "local_file" "inventory" { backup_ip = var.backup_ip, login_name = var.login_name, login_ip = var.login_ip, + monitoring_name = var.monitoring_name, + monitoring_ip = var.monitoring_ip, compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, @@ -66,7 +68,7 @@ resource "local_file" "inventory" { instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, - monitoring=var.monitoring, + cluster_monitoring=var.cluster_monitoring, autoscaling_monitoring = var.autoscaling_monitoring, unsupported = var.unsupported, hyperthreading = var.hyperthreading, diff --git a/autoscaling/tf_init/instance-pool.tf b/autoscaling/tf_init/instance-pool.tf index 37ca4b2e..a5088bb8 100755 --- a/autoscaling/tf_init/instance-pool.tf +++ b/autoscaling/tf_init/instance-pool.tf @@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" { display_name = local.cluster_name freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configurations { availability_domain = var.ad diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 7006035f..f511e48a 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -62,7 +64,7 @@ log_vol=${log_vol} ldap=${ldap} queue=${queue} instance_type=${instance_type} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} diff --git a/conf/variables.tpl b/conf/variables.tpl index 37497275..c4f9ea05 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -92,6 +92,8 @@ variable "backup_name" {default = "${backup_name}"} variable "backup_ip" {default = "${backup_ip}"} variable "login_name" {default = "${login_name}"} variable "login_ip" {default = "${login_ip}"} +variable "monitoring_name" {default = "${monitoring_name}"} +variable "monitoring_ip" {default = "${monitoring_ip}"} variable "scripts_folder" {default = "/opt/oci-hpc/bin/"} variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"} variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"} @@ -120,7 +122,7 @@ variable "hyperthreading" { default = ##HT## } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "##IMAGE##" } variable "ldap" { default = ${ldap} } -variable "monitoring" { default = ${monitoring} } +variable "cluster_monitoring" { default = ${cluster_monitoring} } variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} } diff --git a/controller.tf b/controller.tf index ab942ed0..f8185c20 100644 --- a/controller.tf +++ b/controller.tf @@ -237,6 +237,8 @@ resource "null_resource" "cluster" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -278,7 +280,7 @@ resource "null_resource" "cluster" { shape = local.shape, instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, controller_username = var.controller_username, compute_username = var.compute_username, @@ -384,6 +386,8 @@ resource "null_resource" "cluster" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.controller_subnet_id, @@ -430,7 +434,7 @@ resource "null_resource" "cluster" { localdisk = var.localdisk, log_vol = var.log_vol, redundancy = var.redundancy, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, autoscaling_monitoring = var.autoscaling_monitoring, diff --git a/data.tf b/data.tf index e5dd4277..05c577e3 100755 --- a/data.tf +++ b/data.tf @@ -78,6 +78,13 @@ data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reach private_ip = tostring(oci_core_instance.login[0].private_ip) } +data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_monitoring" { + #Required + count = (var.private_deployment && var.monitoring_node) ? 1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.monitoring[0].private_ip) +} + data "oci_dns_views" "dns_views" { depends_on = [local.controller_subnet, oci_core_vcn.vcn] compartment_id = var.targetCompartment diff --git a/inventory.tpl b/inventory.tpl index eed3ac4d..681cf159 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -53,7 +55,7 @@ redundancy=${redundancy} log_vol=${log_vol} instance_pool_ocpus=${instance_pool_ocpus} queue=${queue} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} ldap=${ldap} autoscaling_monitoring=${autoscaling_monitoring} diff --git a/locals.tf b/locals.tf index 12a07964..93bd4b0b 100755 --- a/locals.tf +++ b/locals.tf @@ -6,11 +6,14 @@ locals { image_ocid = var.unsupported ? var.image_ocid : var.image custom_controller_image_ocid = var.unsupported_controller ? var.unsupported_controller_image : var.custom_controller_image custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image + custom_monitoring_image_ocid = var.unsupported_monitoring ? var.unsupported_monitoring_image : var.custom_monitoring_image + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus controller_ocpus = ( var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex" ) ? var.controller_ocpus_denseIO_flex : var.controller_ocpus login_ocpus = ( var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex" ) ? var.login_ocpus_denseIO_flex : var.login_ocpus + monitoring_ocpus = ( var.monitoring_shape == "VM.DenseIO.E4.Flex" || var.monitoring_shape == "VM.DenseIO.E5.Flex" ) ? var.monitoring_ocpus_denseIO_flex : var.monitoring_ocpus // ips of the instances cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip first_vcn_ip = cidrhost(data.oci_core_subnet.private_subnet.cidr_block,0) @@ -36,6 +39,8 @@ locals { login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + monitoring_image = var.monitoring_node && var.use_marketplace_image_monitoring ? oci_core_app_catalog_subscription.monitoring_mp_image_subscription[0].listing_resource_id : local.custom_monitoring_image_ocid + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid @@ -44,6 +49,7 @@ locals { is_controller_flex_shape = length(regexall(".*VM.*.*Flex$", var.controller_shape)) > 0 ? [local.controller_ocpus]:[] is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] + is_monitoring_flex_shape = length(regexall(".*VM.*.*Flex$", var.monitoring_shape)) > 0 ? [local.monitoring_ocpus]:[] is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] @@ -63,10 +69,12 @@ locals { host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.controller.public_ip controller_bool_ip = var.private_deployment ? false : true login_bool_ip = var.private_deployment ? false : true + monitoring_bool_ip = var.private_deployment ? false : true controller_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" + host_monitoring = var.monitoring_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_monitoring[0].ip_address : oci_core_instance.monitoring[0].public_ip : "none" timeout_per_batch= var.cluster_network ? 30 : 15 timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) diff --git a/marketplace.tf b/marketplace.tf index a735598d..8cac340e 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -3,9 +3,11 @@ locals { mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_controller_listing_id = var.use_marketplace_image_controller ? substr(var.marketplace_listing_controller,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_monitoring_listing_id = var.use_marketplace_image_monitoring ? substr(var.marketplace_listing_monitoring,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_version_id = var.marketplace_version_id[var.marketplace_listing] mp_controller_version_id = var.marketplace_version_id[var.marketplace_listing_controller] mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login] + mp_monitoring_version_id = var.marketplace_version_id[var.marketplace_listing_monitoring] } /* @@ -80,13 +82,23 @@ data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id } - +data "oci_core_app_catalog_listing_resource_versions" "monitoring_app_catalog_listing_resource_versions" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + listing_id = local.mp_monitoring_listing_id +} resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id +} +resource "oci_core_app_catalog_listing_resource_version_agreement" "monitoring_mp_image_agreement" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + + listing_id = local.mp_monitoring_listing_id + listing_resource_version = local.mp_monitoring_version_id + } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { @@ -103,3 +115,17 @@ resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { create = "20m" } } +resource "oci_core_app_catalog_subscription" "monitoring_mp_image_subscription" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + compartment_id = var.targetCompartment + eula_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].eula_link + listing_id = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_id + listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_resource_version + oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].oracle_terms_of_use_link + signature = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].signature + time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].time_retrieved + + timeouts { + create = "20m" + } +} diff --git a/monitoring.tf b/monitoring.tf new file mode 100644 index 00000000..bf8d71e7 --- /dev/null +++ b/monitoring.tf @@ -0,0 +1,55 @@ +resource "oci_core_instance" "monitoring" { + count = var.monitoring_node ? 1 : 0 + depends_on = [oci_core_subnet.public-subnet] + availability_domain = var.monitoring_ad + compartment_id = var.targetCompartment + shape = var.monitoring_shape + + dynamic "shape_config" { + for_each = local.is_monitoring_flex_shape + content { + ocpus = shape_config.value + memory_in_gbs = var.monitoring_custom_memory ? var.monitoring_memory : 16 * shape_config.value + } + } + agent_config { + is_management_disabled = true + } + display_name = "${local.cluster_name}-monitoring" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.controller_config.rendered) + } + source_details { + source_id = local.monitoring_image + boot_volume_size_in_gbs = var.monitoring_boot_volume_size + boot_volume_vpus_per_gb = 30 + source_type = "image" + } + + create_vnic_details { + subnet_id = local.controller_subnet_id + assign_public_ip = local.monitoring_bool_ip + } +} + +resource "oci_dns_rrset" "rrset-monitoring" { + count = var.monitoring_node && var.dns_entries ? 1 : 0 + zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id + domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + items { + domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + rdata = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "" + ttl = 3600 + } + scope = "PRIVATE" + view_id = data.oci_dns_views.dns_views.views[0].id +} \ No newline at end of file diff --git a/outputs.tf b/outputs.tf index af5b5cba..d4770cf1 100755 --- a/outputs.tf +++ b/outputs.tf @@ -12,4 +12,8 @@ output "backup" { output "login" { value = var.login_node ? local.host_login : "No Login Node Defined" +} + +output "monitoring" { + value = var.monitoring_node ? local.host_monitoring : "No Monitoring Node Defined" } \ No newline at end of file diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 9f413982..f7ded4a0 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -9,7 +9,7 @@ - include_role: name: slurm when: slurm|default(false)|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true vars: destroy: true diff --git a/playbooks/monitoring.yml b/playbooks/monitoring.yml new file mode 100644 index 00000000..8f1102b9 --- /dev/null +++ b/playbooks/monitoring.yml @@ -0,0 +1,25 @@ +- hosts: all,!monitoring + gather_facts: true + tasks: + - include_role: + name: metrics-exporter + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + gather_facts: true + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) + +- hosts: controller + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 83553281..5e3adb1b 100644 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -51,7 +51,7 @@ - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -201,10 +201,10 @@ when: spack|default(false)|bool - include_role: name: prometheus - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 95a05f21..576cbd1f 100644 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -49,7 +49,7 @@ - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -194,10 +194,10 @@ when: spack|default(false)|bool - include_role: name: prometheus - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index 99029c50..b5d04156 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -1,4 +1,4 @@ -- hosts: controller, slurm_backup, compute, login +- hosts: controller, slurm_backup, compute, login, monitoring become: true gather_facts: true vars: diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 5921cd16..ecc61d35 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: controller, compute, slurm_backup, login +- hosts: controller, compute, slurm_backup, login, monitoring become: true gather_facts: true vars: diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 64020bc0..3bf2d6c9 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -53,7 +53,7 @@ - name: install grafana include_role: name: grafana - when: not monitoring|default(false)|bool + when: not cluster_monitoring|default(false)|bool # - name: Import mysql-2022 key # become: true diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 97888dcb..d6d47225 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -52,13 +52,13 @@ run_once: true when: not destroy|bool and groups['compute']|length > 0 -- name: move /etc/hosts on backup slurm and login node +- name: move /etc/hosts on backup slurm, login node and monitoring node become: true copy: dest: /etc/hosts src: /etc/hosts force: yes - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: Make sure the IP for each node was not left over in another cluster become: true @@ -66,7 +66,7 @@ dest: /etc/hosts regexp: "^127.0.1.1\\s{{hostvars[groups['controller'][0]]['inventory_hostname']}}.*" state: absent - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: move /etc/hosts on all compute nodes become: true @@ -74,7 +74,7 @@ dest: /etc/hosts src: /tmp/hosts.etc.{{ cluster_name }} force: yes - when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) + when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) and (not 'monitoring' in group_names) - name: remove cluster from etc-host become: true @@ -104,4 +104,14 @@ state: absent delegate_to: "{{ groups['login'][0] }}" run_once: true - when: destroy|bool and (groups['login']|length > 0)|bool \ No newline at end of file + when: destroy|bool and (groups['login']|length > 0)|bool + +- name: remove cluster from etc-host on monitoring + become: true + blockinfile: + dest: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" + state: absent + delegate_to: "{{ groups['monitoring'][0] }}" + run_once: true + when: destroy|bool and (groups['monitoring']|length > 0)|bool \ No newline at end of file diff --git a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 index e604e118..180e46f2 100755 --- a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 +++ b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 @@ -9,4 +9,8 @@ {% for item in groups['login'] %} {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login +{% endfor %} +{% for item in groups['monitoring'] %} +{% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} +{{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} monitoring {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/grafana/tasks/dashboard.yml b/playbooks/roles/grafana/tasks/dashboard.yml index af4b65b5..355d3f50 100644 --- a/playbooks/roles/grafana/tasks/dashboard.yml +++ b/playbooks/roles/grafana/tasks/dashboard.yml @@ -53,6 +53,7 @@ copy: src: "{{ dashboard_build_dir }}/cluster_prometheus_v2.json" dest: "/opt/oci-hpc/monitoring/cluster_prometheus_v2.json" + remote_src: true - name: Import NodeExporter, DCGM, RDMA, NVLink Grafana dashboards community.grafana.grafana_dashboard: diff --git a/playbooks/roles/metrics-exporter/tasks/main.yml b/playbooks/roles/metrics-exporter/tasks/main.yml index 2029c8e1..fa3cad5c 100644 --- a/playbooks/roles/metrics-exporter/tasks/main.yml +++ b/playbooks/roles/metrics-exporter/tasks/main.yml @@ -5,13 +5,13 @@ when: ansible_distribution == 'Ubuntu' - include_tasks: dcgm_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and 'GPU' in shape - include_tasks: rdma_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and cluster_network|bool - include_tasks: nvlink_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and 'GPU' in shape - include_tasks: custom_metrics.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and cluster_network|bool diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml index b1d8b47e..18ba2420 100644 --- a/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml @@ -1,5 +1,14 @@ --- +- name: Create user for prometheus + become: true + user: + name: "{{ prometheus_user }}" + state: present + createhome: no + shell: /usr/sbin/nologin + append: yes + - name: Create /var/lib/prometheus/node_exporter directory become: true file: @@ -7,14 +16,12 @@ state: directory owner: 'prometheus' group: 'prometheus' - when: ('compute' in group_names) - name: Download node_exporter {{ node_exporter }} become: true get_url: url: https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz dest: /tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz - when: ('compute' in group_names) - name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ become: true @@ -23,7 +30,6 @@ dest: /var/lib/prometheus/node_exporter/ extra_opts: [--strip-components=1] remote_src: yes - when: ('compute' in group_names) - name: Recursively change ownership of a /var/lib/prometheus/node_exporter/ become: true @@ -33,7 +39,6 @@ recurse: yes owner: prometheus group: prometheus - when: ('compute' in group_names) - name: Create a symbolic link node_exporter become: true @@ -41,27 +46,30 @@ src: /var/lib/prometheus/node_exporter/node_exporter dest: /usr/bin/node_exporter state: link - when: ('compute' in group_names) - name: Configure node_exporter service become: true copy: src: 'node_exporter.service' dest: '/usr/lib/systemd/system/node_exporter.service' - when: ('compute' in group_names) - name: Run command deactivate selinux for node_exporter, chcon become: true command: chcon --reference=/bin/less /usr/bin/node_exporter - when: ('compute' in group_names) +- name: Make sure the python setuptools are installed + vars: + package_name: + - platform-python-setuptools + include_role: + name: safe_yum + - name: Run command deactivate selinux for node_exporter, semanage become: true command: semanage fcontext -a -t bin_t "/usr/bin/node_exporter" register: node_exporter failed_when: "node_exporter.rc != 0 and 'already defined' not in node_exporter.stderr" - when: ('compute' in group_names) - name: start node_exporter.service become: true @@ -69,4 +77,3 @@ name: node_exporter.service state: restarted enabled: true - when: ('compute' in group_names) diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml index f779465c..3ad498cf 100644 --- a/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml @@ -5,7 +5,6 @@ name: prometheus state: present system: yes - when: "'compute' in group_names" - name: Create prometheus user on compute become: true @@ -14,7 +13,6 @@ comment: "prometheus user" group: prometheus createhome: no # Optional: depending on if you want to create a home directory - when: "'compute' in group_names" - name: Create /var/lib/prometheus/node_exporter directory become: true @@ -23,14 +21,12 @@ state: directory owner: 'prometheus' group: 'prometheus' - when: "'compute' in group_names" - name: Download node_exporter {{ node_exporter }} become: true get_url: url: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" dest: "/tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" - when: "'compute' in group_names" - name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ become: true @@ -39,7 +35,6 @@ dest: "/var/lib/prometheus/node_exporter/" extra_opts: ["--strip-components=1"] remote_src: yes - when: "'compute' in group_names" - name: Recursively change ownership of /var/lib/prometheus/node_exporter/ become: true @@ -49,7 +44,6 @@ recurse: yes owner: prometheus group: prometheus - when: "'compute' in group_names" - name: Create a symbolic link for node_exporter become: true @@ -57,14 +51,12 @@ src: /var/lib/prometheus/node_exporter/node_exporter dest: /usr/bin/node_exporter state: link - when: "'compute' in group_names" - name: Configure node_exporter service become: true copy: src: 'node_exporter.service' dest: '/etc/systemd/system/node_exporter.service' - when: "'compute' in group_names" - name: Start and enable node_exporter service become: true @@ -72,4 +64,3 @@ name: node_exporter.service state: restarted enabled: true - when: "'compute' in group_names" diff --git a/playbooks/roles/prometheus/tasks/gather_info.yml b/playbooks/roles/prometheus/tasks/gather_info.yml index e5cba703..cc3afa38 100644 --- a/playbooks/roles/prometheus/tasks/gather_info.yml +++ b/playbooks/roles/prometheus/tasks/gather_info.yml @@ -3,6 +3,22 @@ register: serial_output delegate_to: "{{ item }}" +- name: Gather FSS IP + shell: "cat /etc/fstab | grep {{nfs_source_path}}" + register: nfs_output + delegate_to: "{{ item }}" + ignore_errors: yes + +- name: Extract the IP address using regex + set_fact: + ip_address: "{{ nfs_output.stdout | regex_search('([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})') }}" + when: nfs_output.rc == 0 + +- name: Extract the IP address using regex + set_fact: + ip_address: "None" + when: nfs_output.rc != 0 + - name: gather metadata uri: url: http://169.254.169.254/opc/v2/instance/ @@ -43,4 +59,4 @@ - name: Build the host_info dictionary set_fact: - host_info: "{{ host_info | default({}) | combine({item: {'serial_number': serial_output.stdout, 'cluster_name': instance_metadata['freeformTags']['cluster_name'], 'shape': instance_metadata['shape'] , 'ocid': instance_metadata['id'] , 'oci_name': instance_metadata['displayName'], 'availabilityDomain': instance_metadata['availabilityDomain'],'compartmentId': instance_metadata['compartmentId'],'rackID': rdma_metadata['rackId'],'networkBlockId': rdma_metadata['networkBlockId'],'rail_id': rdma_metadata['rdmaTopologyData']['customerLocalBlock'],'hpc_island': rdma_metadata['rdmaTopologyData']['customerHPCIslandId'] }}) }}" \ No newline at end of file + host_info: "{{ host_info | default({}) | combine({item: {'serial_number': serial_output.stdout, 'cluster_name': instance_metadata['freeformTags']['cluster_name'], 'shape': instance_metadata['shape'] , 'ocid': instance_metadata['id'] , 'oci_name': instance_metadata['displayName'], 'availabilityDomain': instance_metadata['availabilityDomain'],'compartmentId': instance_metadata['compartmentId'],'rackID': rdma_metadata['rackId'],'networkBlockId': rdma_metadata['networkBlockId'],'rail_id': rdma_metadata['rdmaTopologyData']['customerLocalBlock'],'fss_ip': ip_address,'hpc_island': rdma_metadata['rdmaTopologyData']['customerHPCIslandId'] }}) }}" \ No newline at end of file diff --git a/playbooks/roles/prometheus/tasks/main.yml b/playbooks/roles/prometheus/tasks/main.yml index bd9a5602..c19d49d9 100644 --- a/playbooks/roles/prometheus/tasks/main.yml +++ b/playbooks/roles/prometheus/tasks/main.yml @@ -8,6 +8,8 @@ createhome: no # Create the user's home directory shell: /usr/sbin/nologin append: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Create installation folder in etc become: true @@ -18,6 +20,9 @@ group: "{{ prometheus_user }}" mode: '0775' recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + - name: Create data folder become: true @@ -28,6 +33,8 @@ group: "{{ prometheus_user }}" mode: '0775' recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Download/unarchive Packages for prometheus become: true @@ -38,6 +45,8 @@ group: "{{ prometheus_user }}" remote_src: yes creates: "{{ prometheus_download_dir }}" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Copying the service binary for prometheus become: true @@ -51,6 +60,8 @@ with_items: - prometheus - promtool + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Copying the console binary become: true @@ -64,18 +75,24 @@ with_items: - consoles - console_libraries + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Removing the tar file of prometheus become: true file: path: "{{ prometheus_download_dir }}" state: absent + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Create prometheus systemd service file become: true template: src: templates/prometheus.service.j2 dest: "{{ service_dest_dir }}/prometheus.service" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Get current nodes in /etc/hosts shell: "cat /etc/hosts | grep .local.vcn | awk '{print $2}'" @@ -84,9 +101,16 @@ run_once: true ignore_errors: yes +- name: Get controller and login nodes + shell: "cat /etc/hosts | grep \"controller\\|login\" | grep -v \"ANSIBLE MANAGED BLOCK\" | awk '{print $3}'" + register: c_l_nodes_in_etc_hosts + delegate_to: 127.0.0.1 + run_once: true + ignore_errors: yes + - name: set fact set_fact: - nodelist: "{{ nodes_in_etc_hosts.stdout_lines }}" + nodelist: "{{ nodes_in_etc_hosts.stdout_lines + c_l_nodes_in_etc_hosts.stdout_lines }}" run_once: true - name: Loop over the list of hosts and gather serial number and cluster name @@ -103,7 +127,7 @@ group: "{{ prometheus_user }}" mode: '0775' run_once: true - delegate_to: 127.0.0.1 + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: restart prometheus become: true @@ -112,4 +136,4 @@ state: restarted daemon_reload: yes enabled: yes - delegate_to: 127.0.0.1 + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 51c44d70..3af0233a 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -124,7 +124,7 @@ - name: Get rackIDs for all compute nodes set_fact: racks_to_add_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])| difference(groups['monitoring'])}}" run_once: true register: racks_to_add_temp_results @@ -135,7 +135,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_add_temp_results @@ -162,7 +162,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 @@ -172,7 +172,7 @@ - name: Get hostlist if switch does not exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ new_line[:-1] }}" register: rack_hostlist2 delegate_to: 127.0.0.1 diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 94fdb547..3c384390 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -92,7 +92,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 93c20964..fa3a15ab 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -16,7 +16,7 @@ - name: Get hostnames set_fact: nodes_to_remove_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_remove_temp_results @@ -62,7 +62,7 @@ - name: Get rackIDs set_fact: racks_to_remove_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: racks_to_remove_temp_results diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index 5d9d7c5d..ea6bf5e1 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -36,7 +36,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index d9c15214..8afebd88 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "epel,ol7_developer_EPEL" include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) )and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 1f5a2482..693df83e 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index 5399d7cc..e3533b75 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -10,9 +10,9 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives include_tasks: backup_server.yml diff --git a/playbooks/site.yml b/playbooks/site.yml index 70bc03b6..a76a6afd 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -11,7 +11,7 @@ when: change_hostname | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -90,7 +90,7 @@ name: fss-home when: add_nfs|bool and home_fss|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true tasks: - include_role: @@ -125,7 +125,7 @@ when: not inst_prin|bool -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -254,27 +254,38 @@ name: nccl-conf when: cluster_network|bool -- hosts: all +- hosts: all,!monitoring tasks: - - include_role: - name: prometheus - when: monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) - hosts: controller tasks: - include_role: name: grafana - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool + +- hosts: controller + tasks: - include_role: name: autoscaling_mon when: autoscaling_monitoring|default(false)|bool - include_role: name: cron - - hosts: compute become: true vars: diff --git a/playbooks/slurm_config.yml b/playbooks/slurm_config.yml index dce70f01..ce9c15f0 100755 --- a/playbooks/slurm_config.yml +++ b/playbooks/slurm_config.yml @@ -1,4 +1,4 @@ -- hosts: controller,slurm_backup,compute,login +- hosts: controller,slurm_backup,compute,login, monitoring gather_facts: true vars: destroy: false diff --git a/schema.yaml b/schema.yaml index befa2a21..edcfb0d2 100755 --- a/schema.yaml +++ b/schema.yaml @@ -93,6 +93,26 @@ variableGroups: - ${login_block} - ${login_block_volume_size} - ${login_block_volume_performance} + + - title: "Additional Monitoring Node" + variables: + - ${cluster_monitoring} + - ${monitoring_node} + - ${monitoring_ad} + - ${monitoring_shape} + - ${monitoring_ocpus} + - ${monitoring_ocpus_denseIO_flex} + - ${monitoring_custom_memory} + - ${monitoring_memory} + - ${monitoring_boot_volume_size} + - ${use_marketplace_image_monitoring} + - ${marketplace_listing_monitoring} + - ${unsupported_monitoring} + - ${monitoring_image_compartment} + - ${custom_monitoring_image} + - ${unsupported_monitoring_image} + - ${monitoring_username} + - title: Autoscaling variables: - ${autoscaling} @@ -104,7 +124,7 @@ variableGroups: - ${api_user_ocid} - ${api_fingerprint} - ${api_user_key} - - title: "Monitoring" + - title: "Autoscaling Monitoring" variables: - ${autoscaling_mysql_service} - ${monitoring_shape_name} @@ -178,7 +198,6 @@ variableGroups: - ${rack_aware} - ${queue} - ${spack} - - ${monitoring} - ${enroot} - ${pyxis} - ${pam} @@ -1187,6 +1206,12 @@ variables: description: "Install Enroot, Nvidia Container Toolkit, and docker." visible: ${slurm} + cluster_monitoring: + type: boolean + title: "Install HPC Cluster Monitoring Tools" + default: false + description: "Install Grafana, Node-Exporter, and Prometheus tools for system monitoring." + pam: type: boolean title: "Enable PAM" @@ -1207,12 +1232,6 @@ variables: default: false description: "Will run tests on GPU nodes before starting a job. Nodes that are showing issues will be set in drain state" visible: ${slurm} - - monitoring: - type: boolean - title: "Install HPC Cluster Monitoring Tools" - default: false - description: "Install Grafana, Node-Exporter, and Prometheus tools for system monitoring." autoscaling: type: boolean @@ -1704,3 +1723,235 @@ variables: and: - ${use_marketplace_image_login} - ${login_node} + + + + monitoring_node: + type: boolean + title: "Monitoring Node" + default: false + description: "Create an additional monitoring node for users" + visible: cluster_monitoring + + monitoring_ad: + type: oci:identity:availabilitydomain:name + dependsOn: + compartmentId: ${targetCompartment} + visible: + and: + - complexExpression + - ${monitoring_node} + required: true + description: "Availability Domain for monitoring node" + title: "Availability Domain For monitoring Node" + default: ${ad} + + monitoring_shape: + type: oci:core:instanceshape:name + dependsOn: + compartmentId: ${targetCompartment} + required: true + default: VM.Standard.E4.Flex + visible: ${monitoring_node} + + monitoring_ocpus: + type: integer + description: Number of OCPU's for flex shape + minimum: 1 + maximum: 64 + default: 32 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + required: true + + monitoring_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 16 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E5.Flex" + - ${monitoring_node} + required: true + + monitoring_custom_memory: + title: Use custom memory size + type: boolean + default: false + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + monitoring_memory: + title: Memory in GBS + type: integer + description: Number of memory for flex shape. Minimum 1GB per core. + minimum: 1 + maximum: 1024 + default: 256 + visible: + and: + - and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - and: + - ${monitoring_custom_memory} + - ${monitoring_node} + required: true + + monitoring_boot_volume_size: + type: integer + required: true + minimum: 50 + title: "Size of the boot volume in GB" + default: 512 + visible: ${monitoring_node} + + unsupported_monitoring: + title: "Use unsupported image" + description: "Custom image ID for monitoring Node" + type: boolean + default: false + visible: + and: + - ${monitoring_node} + - not: + - ${use_marketplace_image_monitoring} + + monitoring_image_compartment: + title: "monitoring image compartment" + type: oci:identity:compartment:id + default: ${targetCompartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + + custom_monitoring_image: + title: "monitoring Image ID" + description: "Custom image ID for monitoring nodes. Please note that only Oracle Linux and Ubuntu 22.04 are supported as monitoring image at this moment. " + type: oci:core:image:id + dependsOn: + compartmentId: ${monitoring_image_compartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + unsupported_monitoring_image: + title: "Image OCID" + description: "Custom image ID for monitoring nodes" + type: string + required: true + visible: + and: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + default: "image.ocid" + + monitoring_username: + title: "Default username for monitoring node" + description: "Custom image ID for monitoring node" + type: string + default: "opc" + required: true + visible: ${monitoring_node} + + use_marketplace_image_monitoring: + type: boolean + title: "use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: ${monitoring_node} + + marketplace_listing_monitoring: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL7" + - "HPC_OL8" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" + visible: + and: + - ${use_marketplace_image_monitoring} + - ${monitoring_node} diff --git a/slurm_ha.tf b/slurm_ha.tf index df2cf788..89dda946 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -191,6 +191,8 @@ resource "null_resource" "cluster_backup" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -232,7 +234,7 @@ resource "null_resource" "cluster_backup" { shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, controller_username = var.controller_username, compute_username = var.compute_username, @@ -341,6 +343,8 @@ resource "null_resource" "cluster_backup" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.controller_subnet_id, @@ -387,7 +391,7 @@ resource "null_resource" "cluster_backup" { localdisk = var.localdisk, log_vol = var.log_vol, redundancy = var.redundancy, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, autoscaling_monitoring = var.autoscaling_monitoring, diff --git a/variables.tf b/variables.tf index 4a6ef21e..ccf1fc15 100755 --- a/variables.tf +++ b/variables.tf @@ -24,6 +24,10 @@ variable "custom_login_image" { type = string default = "image.ocid" } +variable "custom_monitoring_image" { + type = string + default = "image.ocid" +} variable "controller_boot_volume_size" {} variable "controller_boot_volume_backup" {} variable "controller_boot_volume_backup_type" {default = "INCREMENTAL"} @@ -40,6 +44,7 @@ variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm variable "use_compute_agent" { default = true } variable "unsupported_controller_image" { default = "" } variable "unsupported_login_image" { default = "" } +variable "unsupported_monitoring_image" { default = "" } variable "use_cluster_nfs" { default = true} variable "use_scratch_nfs" { default = false } variable "cluster_nfs_path" { default = "/nfs/cluster" } @@ -61,6 +66,10 @@ variable "login_node" { default = true } variable "login_ad" {default = ""} variable "login_shape" { default = "VM.Standard2.4" } variable "login_boot_volume_size" {default = 50} +variable "monitoring_node" { default = false } +variable "monitoring_ad" {default = ""} +variable "monitoring_shape" { default = "VM.Standard2.4" } +variable "monitoring_boot_volume_size" {default = 50} variable "slurm_nfs" { default = false } variable "rack_aware" { default = false } variable "ldap" { default = true } @@ -73,10 +82,14 @@ variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } variable "login_ocpus" { default = 2} variable "login_ocpus_denseIO_flex" { default = 8} +variable "monitoring_ocpus" { default = 2} +variable "monitoring_ocpus_denseIO_flex" { default = 8} variable "controller_memory" { default = 16 } variable "controller_custom_memory" { default = false } variable "login_memory" { default = 16 } variable "login_custom_memory" { default = false } +variable "monitoring_memory" { default = 16 } +variable "monitoring_custom_memory" { default = false } variable "privilege_sudo" { default = true } variable "privilege_group_name" { default = "privilege" } @@ -134,9 +147,7 @@ variable "login_block_volume_performance" { */ default = "10. Balanced performance" - } - variable "login_block" { default = false } @@ -171,8 +182,8 @@ variable "nfs_source_IP" { default = ""} variable "nfs_list_of_mount_target_IPs" { default = ""} variable "nfs_source_path" { default = "/app"} variable "nfs_options" {default = ""} -variable "monitoring" { default = true } variable "enroot" { default = false } +variable "cluster_monitoring" { default = false } variable "pyxis" { default = false } variable "pam" { default = false } variable "sacct_limits" { default = false } @@ -195,6 +206,10 @@ variable "unsupported_login" { type=bool default = false } +variable "unsupported_monitoring" { + type=bool + default = false +} variable "controller_username" { type = string default = "opc" @@ -208,7 +223,10 @@ variable "login_username" { type = string default = "opc" } - +variable "monitoring_username" { + type = string + default = "opc" +} variable "autoscaling_monitoring" { type= bool default = false @@ -246,12 +264,16 @@ variable "log_vol" { default = false } variable "redundancy" { default = true } variable "use_marketplace_image_login" { default = true} +variable "use_marketplace_image_monitoring" { default = true} variable "marketplace_listing_login" { - default = "HPC_OL7" + default = "HPC_OL8" +} +variable "marketplace_listing_monitoring" { + default = "HPC_OL8" } variable "marketplace_listing_controller" { - default = "HPC_OL7" + default = "HPC_OL8" } variable "zone_name" { default = ""