Add monitoring node
arnaudfroidmont committed Sep 19, 2024
1 parent de2e194 commit c78461e
Showing 40 changed files with 572 additions and 89 deletions.
2 changes: 2 additions & 0 deletions autoscaling/tf_init/cluster-network.tf
@@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" {
}
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configuration {
availability_domain = var.ad
1 change: 1 addition & 0 deletions autoscaling/tf_init/compute-cluster.tf
@@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" {
#Optional
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
4 changes: 3 additions & 1 deletion autoscaling/tf_init/controller_update.tf
@@ -24,6 +24,8 @@ resource "local_file" "inventory" {
backup_ip = var.backup_ip,
login_name = var.login_name,
login_ip = var.login_ip,
monitoring_name = var.monitoring_name,
monitoring_ip = var.monitoring_ip,
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = var.public_subnet,
private_subnet = var.private_subnet,
@@ -66,7 +68,7 @@ resource "local_file" "inventory" {
instance_pool_ocpus=local.instance_pool_ocpus,
queue=var.queue,
instance_type=var.instance_type,
monitoring=var.monitoring,
cluster_monitoring=var.cluster_monitoring,
autoscaling_monitoring = var.autoscaling_monitoring,
unsupported = var.unsupported,
hyperthreading = var.hyperthreading,
2 changes: 2 additions & 0 deletions autoscaling/tf_init/instance-pool.tf
@@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" {
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configurations {
availability_domain = var.ad
4 changes: 3 additions & 1 deletion autoscaling/tf_init/inventory.tpl
@@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif }
[login]
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
[monitoring]
%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif }
[compute_to_add]
[compute_configured]
%{ for host, ip in compute ~}
@@ -62,7 +64,7 @@ log_vol=${log_vol}
ldap=${ldap}
queue=${queue}
instance_type=${instance_type}
monitoring=${monitoring}
cluster_monitoring=${cluster_monitoring}
hyperthreading=${hyperthreading}
privilege_sudo=${privilege_sudo}
privilege_group_name=${privilege_group_name}
4 changes: 3 additions & 1 deletion conf/variables.tpl
@@ -92,6 +92,8 @@ variable "backup_name" {default = "${backup_name}"}
variable "backup_ip" {default = "${backup_ip}"}
variable "login_name" {default = "${login_name}"}
variable "login_ip" {default = "${login_ip}"}
variable "monitoring_name" {default = "${monitoring_name}"}
variable "monitoring_ip" {default = "${monitoring_ip}"}
variable "scripts_folder" {default = "/opt/oci-hpc/bin/"}
variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"}
variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"}
@@ -120,7 +122,7 @@ variable "hyperthreading" { default = ##HT## }
variable "unsupported" { default = ${unsupported} }
variable "image_ocid" { default = "##IMAGE##" }
variable "ldap" { default = ${ldap} }
variable "monitoring" { default = ${monitoring} }
variable "cluster_monitoring" { default = ${cluster_monitoring} }
variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} }


8 changes: 6 additions & 2 deletions controller.tf
@@ -237,6 +237,8 @@ resource "null_resource" "cluster" {
backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "",
login_name = var.login_node ? oci_core_instance.login[0].display_name : "",
login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "",
monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "",
monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "",
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = data.oci_core_subnet.public_subnet.cidr_block,
private_subnet = data.oci_core_subnet.private_subnet.cidr_block,
@@ -278,7 +280,7 @@ resource "null_resource" "cluster" {
shape = local.shape,
instance_pool_ocpus = local.instance_pool_ocpus,
queue=var.queue,
monitoring = var.monitoring,
cluster_monitoring = var.cluster_monitoring,
hyperthreading = var.hyperthreading,
controller_username = var.controller_username,
compute_username = var.compute_username,
@@ -384,6 +386,8 @@ resource "null_resource" "cluster" {
backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "",
login_name = var.login_node ? oci_core_instance.login[0].display_name : "",
login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "",
monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "",
monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "",
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = data.oci_core_subnet.public_subnet.cidr_block,
public_subnet_id = local.controller_subnet_id,
@@ -430,7 +434,7 @@ resource "null_resource" "cluster" {
localdisk = var.localdisk,
log_vol = var.log_vol,
redundancy = var.redundancy,
monitoring = var.monitoring,
cluster_monitoring = var.cluster_monitoring,
hyperthreading = var.hyperthreading,
unsupported = var.unsupported,
autoscaling_monitoring = var.autoscaling_monitoring,
7 changes: 7 additions & 0 deletions data.tf
@@ -78,6 +78,13 @@ data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reach
private_ip = tostring(oci_core_instance.login[0].private_ip)
}

data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_monitoring" {
#Required
count = (var.private_deployment && var.monitoring_node) ? 1 : 0
private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id
private_ip = tostring(oci_core_instance.monitoring[0].private_ip)
}

data "oci_dns_views" "dns_views" {
depends_on = [local.controller_subnet, oci_core_vcn.vcn]
compartment_id = var.targetCompartment
4 changes: 3 additions & 1 deletion inventory.tpl
@@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=controller%{ endif }
[login]
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
[monitoring]
%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif }
[compute_to_add]
[compute_configured]
%{ for host, ip in compute ~}
@@ -53,7 +55,7 @@ redundancy=${redundancy}
log_vol=${log_vol}
instance_pool_ocpus=${instance_pool_ocpus}
queue=${queue}
monitoring=${monitoring}
cluster_monitoring=${cluster_monitoring}
hyperthreading=${hyperthreading}
ldap=${ldap}
autoscaling_monitoring=${autoscaling_monitoring}
8 changes: 8 additions & 0 deletions locals.tf
@@ -6,11 +6,14 @@ locals {
image_ocid = var.unsupported ? var.image_ocid : var.image
custom_controller_image_ocid = var.unsupported_controller ? var.unsupported_controller_image : var.custom_controller_image
custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image
custom_monitoring_image_ocid = var.unsupported_monitoring ? var.unsupported_monitoring_image : var.custom_monitoring_image


shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape
instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
controller_ocpus = ( var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex" ) ? var.controller_ocpus_denseIO_flex : var.controller_ocpus
login_ocpus = ( var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex" ) ? var.login_ocpus_denseIO_flex : var.login_ocpus
monitoring_ocpus = ( var.monitoring_shape == "VM.DenseIO.E4.Flex" || var.monitoring_shape == "VM.DenseIO.E5.Flex" ) ? var.monitoring_ocpus_denseIO_flex : var.monitoring_ocpus
// ips of the instances
cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip
first_vcn_ip = cidrhost(data.oci_core_subnet.private_subnet.cidr_block,0)
@@ -36,6 +39,8 @@ locals {

login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid

monitoring_image = var.monitoring_node && var.use_marketplace_image_monitoring ? oci_core_app_catalog_subscription.monitoring_mp_image_subscription[0].listing_resource_id : local.custom_monitoring_image_ocid

cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid

instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid
@@ -44,6 +49,7 @@

is_controller_flex_shape = length(regexall(".*VM.*.*Flex$", var.controller_shape)) > 0 ? [local.controller_ocpus]:[]
is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[]
is_monitoring_flex_shape = length(regexall(".*VM.*.*Flex$", var.monitoring_shape)) > 0 ? [local.monitoring_ocpus]:[]

is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[]

@@ -63,10 +69,12 @@ locals {
host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.controller.public_ip
controller_bool_ip = var.private_deployment ? false : true
login_bool_ip = var.private_deployment ? false : true
monitoring_bool_ip = var.private_deployment ? false : true
controller_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet
private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet]
host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none"
host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none"
host_monitoring = var.monitoring_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_monitoring[0].ip_address : oci_core_instance.monitoring[0].public_ip : "none"

timeout_per_batch= var.cluster_network ? 30 : 15
timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"])
28 changes: 27 additions & 1 deletion marketplace.tf
@@ -3,9 +3,11 @@ locals {
mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_controller_listing_id = var.use_marketplace_image_controller ? substr(var.marketplace_listing_controller,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_monitoring_listing_id = var.use_marketplace_image_monitoring ? substr(var.marketplace_listing_monitoring,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_version_id = var.marketplace_version_id[var.marketplace_listing]
mp_controller_version_id = var.marketplace_version_id[var.marketplace_listing_controller]
mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login]
mp_monitoring_version_id = var.marketplace_version_id[var.marketplace_listing_monitoring]
}

/*
@@ -80,13 +82,23 @@ data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing
count = var.login_node && var.use_marketplace_image_login ? 1 : 0
listing_id = local.mp_login_listing_id
}

data "oci_core_app_catalog_listing_resource_versions" "monitoring_app_catalog_listing_resource_versions" {
count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0
listing_id = local.mp_monitoring_listing_id
}
resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" {
count = var.login_node && var.use_marketplace_image_login ? 1 : 0

listing_id = local.mp_login_listing_id
listing_resource_version = local.mp_login_version_id

}
resource "oci_core_app_catalog_listing_resource_version_agreement" "monitoring_mp_image_agreement" {
count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0

listing_id = local.mp_monitoring_listing_id
listing_resource_version = local.mp_monitoring_version_id

}

resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" {
@@ -103,3 +115,17 @@ resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" {
create = "20m"
}
}
resource "oci_core_app_catalog_subscription" "monitoring_mp_image_subscription" {
count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0
compartment_id = var.targetCompartment
eula_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].eula_link
listing_id = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_id
listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_resource_version
oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].oracle_terms_of_use_link
signature = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].signature
time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].time_retrieved

timeouts {
create = "20m"
}
}
55 changes: 55 additions & 0 deletions monitoring.tf
@@ -0,0 +1,55 @@
resource "oci_core_instance" "monitoring" {
count = var.monitoring_node ? 1 : 0
depends_on = [oci_core_subnet.public-subnet]
availability_domain = var.monitoring_ad
compartment_id = var.targetCompartment
shape = var.monitoring_shape

dynamic "shape_config" {
for_each = local.is_monitoring_flex_shape
content {
ocpus = shape_config.value
memory_in_gbs = var.monitoring_custom_memory ? var.monitoring_memory : 16 * shape_config.value
}
}
agent_config {
is_management_disabled = true
}
display_name = "${local.cluster_name}-monitoring"

freeform_tags = {
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}

metadata = {
ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}"
user_data = base64encode(data.template_file.controller_config.rendered)
}
source_details {
source_id = local.monitoring_image
boot_volume_size_in_gbs = var.monitoring_boot_volume_size
boot_volume_vpus_per_gb = 30
source_type = "image"
}

create_vnic_details {
subnet_id = local.controller_subnet_id
assign_public_ip = local.monitoring_bool_ip
}
}

resource "oci_dns_rrset" "rrset-monitoring" {
count = var.monitoring_node && var.dns_entries ? 1 : 0
zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id
domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}"
rtype = "A"
items {
domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}"
rtype = "A"
rdata = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: ""
ttl = 3600
}
scope = "PRIVATE"
view_id = data.oci_dns_views.dns_views.views[0].id
}
4 changes: 4 additions & 0 deletions outputs.tf
@@ -12,4 +12,8 @@ output "backup" {

output "login" {
value = var.login_node ? local.host_login : "No Login Node Defined"
}

output "monitoring" {
value = var.monitoring_node ? local.host_monitoring : "No Monitoring Node Defined"
}
2 changes: 1 addition & 1 deletion playbooks/destroy.yml
@@ -9,7 +9,7 @@
- include_role:
name: slurm
when: slurm|default(false)|bool
- hosts: controller, slurm_backup, login
- hosts: controller, slurm_backup, login, monitoring
become: true
vars:
destroy: true
Expand Down
25 changes: 25 additions & 0 deletions playbooks/monitoring.yml
@@ -0,0 +1,25 @@
- hosts: all,!monitoring
gather_facts: true
tasks:
- include_role:
name: metrics-exporter
when: cluster_monitoring|default(false)|bool

- hosts: monitoring
gather_facts: true
tasks:
- include_role:
name: grafana
when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 )

- hosts: controller
tasks:
- include_role:
name: grafana
when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 )

- hosts: controller, monitoring
tasks:
- include_role:
name: prometheus
when: cluster_monitoring|default(false)|bool
6 changes: 3 additions & 3 deletions playbooks/new_nodes.yml
@@ -51,7 +51,7 @@
- include_role:
name: healthchecks

- hosts: controller,slurm_backup,login,compute
- hosts: controller,slurm_backup,login,compute, monitoring
become: true
vars:
destroy: false
@@ -201,10 +201,10 @@
when: spack|default(false)|bool
- include_role:
name: prometheus
when: monitoring|default(false)|bool
when: cluster_monitoring|default(false)|bool
- include_role:
name: metrics-exporter
when: monitoring|default(false)|bool
when: cluster_monitoring|default(false)|bool
- include_role:
name: slurm
when: slurm|default(false)|bool
6 changes: 3 additions & 3 deletions playbooks/resize_add.yml
@@ -49,7 +49,7 @@
- include_role:
name: healthchecks

- hosts: controller,slurm_backup,login,compute
- hosts: controller,slurm_backup,login,compute, monitoring
become: true
vars:
destroy: false
@@ -194,10 +194,10 @@
when: spack|default(false)|bool
- include_role:
name: prometheus
when: monitoring|default(false)|bool
when: cluster_monitoring|default(false)|bool
- include_role:
name: metrics-exporter
when: monitoring|default(false)|bool
when: cluster_monitoring|default(false)|bool
- include_role:
name: slurm
when: slurm|default(false)|bool