diff --git a/autoscaling/tf_init/cluster-network.tf b/autoscaling/tf_init/cluster-network.tf index d7b0b4f0..c3218b1f 100755 --- a/autoscaling/tf_init/cluster-network.tf +++ b/autoscaling/tf_init/cluster-network.tf @@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" { } freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configuration { availability_domain = var.ad diff --git a/autoscaling/tf_init/compute-cluster.tf b/autoscaling/tf_init/compute-cluster.tf index ef9067b8..1b5b7dfa 100755 --- a/autoscaling/tf_init/compute-cluster.tf +++ b/autoscaling/tf_init/compute-cluster.tf @@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" { #Optional display_name = local.cluster_name freeform_tags = { + "user" = var.tags "cluster_name" = local.cluster_name "parent_cluster" = local.cluster_name } diff --git a/autoscaling/tf_init/controller_update.tf b/autoscaling/tf_init/controller_update.tf index 86ddf9e5..1f4d36db 100755 --- a/autoscaling/tf_init/controller_update.tf +++ b/autoscaling/tf_init/controller_update.tf @@ -24,6 +24,8 @@ resource "local_file" "inventory" { backup_ip = var.backup_ip, login_name = var.login_name, login_ip = var.login_ip, + monitoring_name = var.monitoring_name, + monitoring_ip = var.monitoring_ip, compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, @@ -66,7 +68,7 @@ resource "local_file" "inventory" { instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, - monitoring=var.monitoring, + cluster_monitoring=var.cluster_monitoring, autoscaling_monitoring = var.autoscaling_monitoring, unsupported = var.unsupported, hyperthreading = var.hyperthreading, diff --git a/autoscaling/tf_init/instance-pool.tf b/autoscaling/tf_init/instance-pool.tf index 37ca4b2e..a5088bb8 100755 --- a/autoscaling/tf_init/instance-pool.tf +++ b/autoscaling/tf_init/instance-pool.tf @@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" { display_name = local.cluster_name freeform_tags = { "user" = var.tags + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name } placement_configurations { availability_domain = var.ad diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 7006035f..f511e48a 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -62,7 +64,7 @@ log_vol=${log_vol} ldap=${ldap} queue=${queue} instance_type=${instance_type} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} diff --git a/conf/variables.tpl b/conf/variables.tpl index 37497275..c4f9ea05 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -92,6 +92,8 @@ 
variable "backup_name" {default = "${backup_name}"} variable "backup_ip" {default = "${backup_ip}"} variable "login_name" {default = "${login_name}"} variable "login_ip" {default = "${login_ip}"} +variable "monitoring_name" {default = "${monitoring_name}"} +variable "monitoring_ip" {default = "${monitoring_ip}"} variable "scripts_folder" {default = "/opt/oci-hpc/bin/"} variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"} variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"} @@ -120,7 +122,7 @@ variable "hyperthreading" { default = ##HT## } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "##IMAGE##" } variable "ldap" { default = ${ldap} } -variable "monitoring" { default = ${monitoring} } +variable "cluster_monitoring" { default = ${cluster_monitoring} } variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} } diff --git a/controller.tf b/controller.tf index ab942ed0..f8185c20 100644 --- a/controller.tf +++ b/controller.tf @@ -237,6 +237,8 @@ resource "null_resource" "cluster" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -278,7 +280,7 @@ resource "null_resource" "cluster" { shape = local.shape, instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, controller_username = var.controller_username, compute_username = var.compute_username, @@ -384,6 +386,8 @@ resource "null_resource" "cluster" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.controller_subnet_id, @@ -430,7 +434,7 @@ resource "null_resource" "cluster" { localdisk = var.localdisk, log_vol = var.log_vol, redundancy = var.redundancy, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, autoscaling_monitoring = var.autoscaling_monitoring, diff --git a/data.tf b/data.tf index e5dd4277..05c577e3 100755 --- a/data.tf +++ b/data.tf @@ -78,6 +78,13 @@ data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reach private_ip = tostring(oci_core_instance.login[0].private_ip) } +data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_monitoring" { + #Required + count = (var.private_deployment && var.monitoring_node) ? 
1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.monitoring[0].private_ip) +} + data "oci_dns_views" "dns_views" { depends_on = [local.controller_subnet, oci_core_vcn.vcn] compartment_id = var.targetCompartment diff --git a/inventory.tpl b/inventory.tpl index eed3ac4d..681cf159 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=controller%{ endif } [login] %{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } +[monitoring] +%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -53,7 +55,7 @@ redundancy=${redundancy} log_vol=${log_vol} instance_pool_ocpus=${instance_pool_ocpus} queue=${queue} -monitoring=${monitoring} +cluster_monitoring=${cluster_monitoring} hyperthreading=${hyperthreading} ldap=${ldap} autoscaling_monitoring=${autoscaling_monitoring} diff --git a/locals.tf b/locals.tf index 12a07964..93bd4b0b 100755 --- a/locals.tf +++ b/locals.tf @@ -6,11 +6,14 @@ locals { image_ocid = var.unsupported ? var.image_ocid : var.image custom_controller_image_ocid = var.unsupported_controller ? var.unsupported_controller_image : var.custom_controller_image custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image + custom_monitoring_image_ocid = var.unsupported_monitoring ? var.unsupported_monitoring_image : var.custom_monitoring_image + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape instance_pool_ocpus = ( local.shape == "VM.DenseIO.E4.Flex" || local.shape == "VM.DenseIO.E5.Flex" ) ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus controller_ocpus = ( var.controller_shape == "VM.DenseIO.E4.Flex" || var.controller_shape == "VM.DenseIO.E5.Flex" ) ? var.controller_ocpus_denseIO_flex : var.controller_ocpus login_ocpus = ( var.login_shape == "VM.DenseIO.E4.Flex" || var.login_shape == "VM.DenseIO.E5.Flex" ) ? var.login_ocpus_denseIO_flex : var.login_ocpus + monitoring_ocpus = ( var.monitoring_shape == "VM.DenseIO.E4.Flex" || var.monitoring_shape == "VM.DenseIO.E5.Flex" ) ? var.monitoring_ocpus_denseIO_flex : var.monitoring_ocpus // ips of the instances cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip first_vcn_ip = cidrhost(data.oci_core_subnet.private_subnet.cidr_block,0) @@ -36,6 +39,8 @@ locals { login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + monitoring_image = var.monitoring_node && var.use_marketplace_image_monitoring ? oci_core_app_catalog_subscription.monitoring_mp_image_subscription[0].listing_resource_id : local.custom_monitoring_image_ocid + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? 
oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid @@ -44,6 +49,7 @@ locals { is_controller_flex_shape = length(regexall(".*VM.*.*Flex$", var.controller_shape)) > 0 ? [local.controller_ocpus]:[] is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] + is_monitoring_flex_shape = length(regexall(".*VM.*.*Flex$", var.monitoring_shape)) > 0 ? [local.monitoring_ocpus]:[] is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] @@ -63,10 +69,12 @@ locals { host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.controller.public_ip controller_bool_ip = var.private_deployment ? false : true login_bool_ip = var.private_deployment ? false : true + monitoring_bool_ip = var.private_deployment ? false : true controller_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" + host_monitoring = var.monitoring_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_monitoring[0].ip_address : oci_core_instance.monitoring[0].public_ip : "none" timeout_per_batch= var.cluster_network ? 30 : 15 timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) diff --git a/marketplace.tf b/marketplace.tf index a735598d..8cac340e 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -3,9 +3,11 @@ locals { mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_controller_listing_id = var.use_marketplace_image_controller ? substr(var.marketplace_listing_controller,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_monitoring_listing_id = var.use_marketplace_image_monitoring ? substr(var.marketplace_listing_monitoring,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_version_id = var.marketplace_version_id[var.marketplace_listing] mp_controller_version_id = var.marketplace_version_id[var.marketplace_listing_controller] mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login] + mp_monitoring_version_id = var.marketplace_version_id[var.marketplace_listing_monitoring] } /* @@ -80,13 +82,23 @@ data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing count = var.login_node && var.use_marketplace_image_login ? 
1 : 0 listing_id = local.mp_login_listing_id } - +data "oci_core_app_catalog_listing_resource_versions" "monitoring_app_catalog_listing_resource_versions" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + listing_id = local.mp_monitoring_listing_id +} resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id +} +resource "oci_core_app_catalog_listing_resource_version_agreement" "monitoring_mp_image_agreement" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + + listing_id = local.mp_monitoring_listing_id + listing_resource_version = local.mp_monitoring_version_id + } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { @@ -103,3 +115,17 @@ resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { create = "20m" } } +resource "oci_core_app_catalog_subscription" "monitoring_mp_image_subscription" { + count = var.monitoring_node && var.use_marketplace_image_monitoring ? 1 : 0 + compartment_id = var.targetCompartment + eula_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].eula_link + listing_id = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_id + listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].listing_resource_version + oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].oracle_terms_of_use_link + signature = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].signature + time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.monitoring_mp_image_agreement[0].time_retrieved + + timeouts { + create = "20m" + } +} diff --git a/monitoring.tf b/monitoring.tf new file mode 100644 index 00000000..bf8d71e7 --- /dev/null +++ b/monitoring.tf @@ -0,0 +1,55 @@ +resource "oci_core_instance" "monitoring" { + count = var.monitoring_node ? 1 : 0 + depends_on = [oci_core_subnet.public-subnet] + availability_domain = var.monitoring_ad + compartment_id = var.targetCompartment + shape = var.monitoring_shape + + dynamic "shape_config" { + for_each = local.is_monitoring_flex_shape + content { + ocpus = shape_config.value + memory_in_gbs = var.monitoring_custom_memory ? var.monitoring_memory : 16 * shape_config.value + } + } + agent_config { + is_management_disabled = true + } + display_name = "${local.cluster_name}-monitoring" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.controller_config.rendered) + } + source_details { + source_id = local.monitoring_image + boot_volume_size_in_gbs = var.monitoring_boot_volume_size + boot_volume_vpus_per_gb = 30 + source_type = "image" + } + + create_vnic_details { + subnet_id = local.controller_subnet_id + assign_public_ip = local.monitoring_bool_ip + } +} + +resource "oci_dns_rrset" "rrset-monitoring" { + count = var.monitoring_node && var.dns_entries ? 1 : 0 + zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id + domain = "${var.monitoring_node ? 
oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + items { + domain = "${var.monitoring_node ? oci_core_instance.monitoring[0].display_name : ""}.${local.zone_name}" + rtype = "A" + rdata = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "" + ttl = 3600 + } + scope = "PRIVATE" + view_id = data.oci_dns_views.dns_views.views[0].id +} \ No newline at end of file diff --git a/outputs.tf b/outputs.tf index af5b5cba..d4770cf1 100755 --- a/outputs.tf +++ b/outputs.tf @@ -12,4 +12,8 @@ output "backup" { output "login" { value = var.login_node ? local.host_login : "No Login Node Defined" +} + +output "monitoring" { + value = var.monitoring_node ? local.host_monitoring : "No Monitoring Node Defined" } \ No newline at end of file diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 9f413982..f7ded4a0 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -9,7 +9,7 @@ - include_role: name: slurm when: slurm|default(false)|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true vars: destroy: true diff --git a/playbooks/monitoring.yml b/playbooks/monitoring.yml new file mode 100644 index 00000000..8f1102b9 --- /dev/null +++ b/playbooks/monitoring.yml @@ -0,0 +1,25 @@ +- hosts: all,!monitoring + gather_facts: true + tasks: + - include_role: + name: metrics-exporter + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + gather_facts: true + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) + +- hosts: controller + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 83553281..5e3adb1b 100644 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -51,7 +51,7 @@ - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -201,10 +201,10 @@ when: spack|default(false)|bool - include_role: name: prometheus - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 95a05f21..576cbd1f 100644 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -49,7 +49,7 @@ - include_role: name: healthchecks -- hosts: controller,slurm_backup,login,compute +- hosts: controller,slurm_backup,login,compute, monitoring become: true vars: destroy: false @@ -194,10 +194,10 @@ when: spack|default(false)|bool - include_role: name: prometheus - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index 99029c50..b5d04156 100755 --- a/playbooks/resize_remove.yml 
+++ b/playbooks/resize_remove.yml @@ -1,4 +1,4 @@ -- hosts: controller, slurm_backup, compute, login +- hosts: controller, slurm_backup, compute, login, monitoring become: true gather_facts: true vars: diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 5921cd16..ecc61d35 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: controller, compute, slurm_backup, login +- hosts: controller, compute, slurm_backup, login, monitoring become: true gather_facts: true vars: diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 64020bc0..3bf2d6c9 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -53,7 +53,7 @@ - name: install grafana include_role: name: grafana - when: not monitoring|default(false)|bool + when: not cluster_monitoring|default(false)|bool # - name: Import mysql-2022 key # become: true diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 97888dcb..d6d47225 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -52,13 +52,13 @@ run_once: true when: not destroy|bool and groups['compute']|length > 0 -- name: move /etc/hosts on backup slurm and login node +- name: move /etc/hosts on backup slurm, login node and monitoring node become: true copy: dest: /etc/hosts src: /etc/hosts force: yes - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: Make sure the IP for each node was not left over in another cluster become: true @@ -66,7 +66,7 @@ dest: /etc/hosts regexp: "^127.0.1.1\\s{{hostvars[groups['controller'][0]]['inventory_hostname']}}.*" state: absent - when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names) or ('monitoring' in group_names)) - name: move /etc/hosts on all compute nodes become: true @@ -74,7 +74,7 @@ dest: /etc/hosts src: /tmp/hosts.etc.{{ cluster_name }} force: yes - when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) + when: ( not destroy|bool ) and (not 'controller' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) and (not 'monitoring' in group_names) - name: remove cluster from etc-host become: true @@ -104,4 +104,14 @@ state: absent delegate_to: "{{ groups['login'][0] }}" run_once: true - when: destroy|bool and (groups['login']|length > 0)|bool \ No newline at end of file + when: destroy|bool and (groups['login']|length > 0)|bool + +- name: remove cluster from etc-host on monitoring + become: true + blockinfile: + dest: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" + state: absent + delegate_to: "{{ groups['monitoring'][0] }}" + run_once: true + when: destroy|bool and (groups['monitoring']|length > 0)|bool \ No newline at end of file diff --git a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 index e604e118..180e46f2 100755 --- a/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 
+++ b/playbooks/roles/etc-hosts/templates/etc-hosts-controller.j2 @@ -9,4 +9,8 @@ {% for item in groups['login'] %} {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login +{% endfor %} +{% for item in groups['monitoring'] %} +{% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} +{{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} monitoring {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/grafana/tasks/dashboard.yml b/playbooks/roles/grafana/tasks/dashboard.yml index af4b65b5..355d3f50 100644 --- a/playbooks/roles/grafana/tasks/dashboard.yml +++ b/playbooks/roles/grafana/tasks/dashboard.yml @@ -53,6 +53,7 @@ copy: src: "{{ dashboard_build_dir }}/cluster_prometheus_v2.json" dest: "/opt/oci-hpc/monitoring/cluster_prometheus_v2.json" + remote_src: true - name: Import NodeExporter, DCGM, RDMA, NVLink Grafana dashboards community.grafana.grafana_dashboard: diff --git a/playbooks/roles/metrics-exporter/tasks/main.yml b/playbooks/roles/metrics-exporter/tasks/main.yml index 2029c8e1..fa3cad5c 100644 --- a/playbooks/roles/metrics-exporter/tasks/main.yml +++ b/playbooks/roles/metrics-exporter/tasks/main.yml @@ -5,13 +5,13 @@ when: ansible_distribution == 'Ubuntu' - include_tasks: dcgm_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and 'GPU' in shape - include_tasks: rdma_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and cluster_network|bool - include_tasks: nvlink_exporter.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and 'GPU' in shape - include_tasks: custom_metrics.yml - when: ('compute' in group_names) + when: ('compute' in group_names) and cluster_network|bool diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml index b1d8b47e..18ba2420 100644 --- a/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_el.yml @@ -1,5 +1,14 @@ --- +- name: Create user for prometheus + become: true + user: + name: "{{ prometheus_user }}" + state: present + createhome: no + shell: /usr/sbin/nologin + append: yes + - name: Create /var/lib/prometheus/node_exporter directory become: true file: @@ -7,14 +16,12 @@ state: directory owner: 'prometheus' group: 'prometheus' - when: ('compute' in group_names) - name: Download node_exporter {{ node_exporter }} become: true get_url: url: https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz dest: /tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz - when: ('compute' in group_names) - name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ become: true @@ -23,7 +30,6 @@ dest: /var/lib/prometheus/node_exporter/ extra_opts: [--strip-components=1] remote_src: yes - when: ('compute' in group_names) - name: Recursively change ownership of a /var/lib/prometheus/node_exporter/ become: true @@ -33,7 +39,6 @@ recurse: yes owner: prometheus group: prometheus - when: ('compute' in group_names) - name: Create a symbolic link node_exporter become: true @@ -41,27 +46,30 @@ src: /var/lib/prometheus/node_exporter/node_exporter dest: /usr/bin/node_exporter state: link - when: ('compute' in group_names) - name: Configure 
node_exporter service become: true copy: src: 'node_exporter.service' dest: '/usr/lib/systemd/system/node_exporter.service' - when: ('compute' in group_names) - name: Run command deactivate selinux for node_exporter, chcon become: true command: chcon --reference=/bin/less /usr/bin/node_exporter - when: ('compute' in group_names) +- name: Make sure the python setuptools are installed + vars: + package_name: + - platform-python-setuptools + include_role: + name: safe_yum + - name: Run command deactivate selinux for node_exporter, semanage become: true command: semanage fcontext -a -t bin_t "/usr/bin/node_exporter" register: node_exporter failed_when: "node_exporter.rc != 0 and 'already defined' not in node_exporter.stderr" - when: ('compute' in group_names) - name: start node_exporter.service become: true @@ -69,4 +77,3 @@ name: node_exporter.service state: restarted enabled: true - when: ('compute' in group_names) diff --git a/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml index f779465c..3ad498cf 100644 --- a/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml +++ b/playbooks/roles/metrics-exporter/tasks/node_exporter_ubuntu.yml @@ -5,7 +5,6 @@ name: prometheus state: present system: yes - when: "'compute' in group_names" - name: Create prometheus user on compute become: true @@ -14,7 +13,6 @@ comment: "prometheus user" group: prometheus createhome: no # Optional: depending on if you want to create a home directory - when: "'compute' in group_names" - name: Create /var/lib/prometheus/node_exporter directory become: true @@ -23,14 +21,12 @@ state: directory owner: 'prometheus' group: 'prometheus' - when: "'compute' in group_names" - name: Download node_exporter {{ node_exporter }} become: true get_url: url: "https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter }}/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" dest: "/tmp/node_exporter-{{ node_exporter }}.linux-amd64.tar.gz" - when: "'compute' in group_names" - name: Extract node_exporter {{ node_exporter }}.linux-amd64.tar.gz into /var/lib/prometheus/node_exporter/ become: true @@ -39,7 +35,6 @@ dest: "/var/lib/prometheus/node_exporter/" extra_opts: ["--strip-components=1"] remote_src: yes - when: "'compute' in group_names" - name: Recursively change ownership of /var/lib/prometheus/node_exporter/ become: true @@ -49,7 +44,6 @@ recurse: yes owner: prometheus group: prometheus - when: "'compute' in group_names" - name: Create a symbolic link for node_exporter become: true @@ -57,14 +51,12 @@ src: /var/lib/prometheus/node_exporter/node_exporter dest: /usr/bin/node_exporter state: link - when: "'compute' in group_names" - name: Configure node_exporter service become: true copy: src: 'node_exporter.service' dest: '/etc/systemd/system/node_exporter.service' - when: "'compute' in group_names" - name: Start and enable node_exporter service become: true @@ -72,4 +64,3 @@ name: node_exporter.service state: restarted enabled: true - when: "'compute' in group_names" diff --git a/playbooks/roles/prometheus/tasks/gather_info.yml b/playbooks/roles/prometheus/tasks/gather_info.yml index e5cba703..cc3afa38 100644 --- a/playbooks/roles/prometheus/tasks/gather_info.yml +++ b/playbooks/roles/prometheus/tasks/gather_info.yml @@ -3,6 +3,22 @@ register: serial_output delegate_to: "{{ item }}" +- name: Gather FSS IP + shell: "cat /etc/fstab | grep {{nfs_source_path}}" + register: nfs_output + delegate_to: "{{ item }}" + ignore_errors: 
yes + +- name: Extract the IP address using regex + set_fact: + ip_address: "{{ nfs_output.stdout | regex_search('([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})') }}" + when: nfs_output.rc == 0 + +- name: Set the IP address to None when no FSS mount is found + set_fact: + ip_address: "None" + when: nfs_output.rc != 0 + - name: gather metadata uri: url: http://169.254.169.254/opc/v2/instance/ @@ -43,4 +59,4 @@ - name: Build the host_info dictionary set_fact: - host_info: "{{ host_info | default({}) | combine({item: {'serial_number': serial_output.stdout, 'cluster_name': instance_metadata['freeformTags']['cluster_name'], 'shape': instance_metadata['shape'] , 'ocid': instance_metadata['id'] , 'oci_name': instance_metadata['displayName'], 'availabilityDomain': instance_metadata['availabilityDomain'],'compartmentId': instance_metadata['compartmentId'],'rackID': rdma_metadata['rackId'],'networkBlockId': rdma_metadata['networkBlockId'],'rail_id': rdma_metadata['rdmaTopologyData']['customerLocalBlock'],'hpc_island': rdma_metadata['rdmaTopologyData']['customerHPCIslandId'] }}) }}" \ No newline at end of file + host_info: "{{ host_info | default({}) | combine({item: {'serial_number': serial_output.stdout, 'cluster_name': instance_metadata['freeformTags']['cluster_name'], 'shape': instance_metadata['shape'] , 'ocid': instance_metadata['id'] , 'oci_name': instance_metadata['displayName'], 'availabilityDomain': instance_metadata['availabilityDomain'],'compartmentId': instance_metadata['compartmentId'],'rackID': rdma_metadata['rackId'],'networkBlockId': rdma_metadata['networkBlockId'],'rail_id': rdma_metadata['rdmaTopologyData']['customerLocalBlock'],'fss_ip': ip_address,'hpc_island': rdma_metadata['rdmaTopologyData']['customerHPCIslandId'] }}) }}" \ No newline at end of file diff --git a/playbooks/roles/prometheus/tasks/main.yml b/playbooks/roles/prometheus/tasks/main.yml index bd9a5602..c19d49d9 100644 --- a/playbooks/roles/prometheus/tasks/main.yml +++ b/playbooks/roles/prometheus/tasks/main.yml @@ -8,6 +8,8 @@ createhome: no # Create the user's home directory shell: /usr/sbin/nologin append: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Create installation folder in etc become: true @@ -18,6 +20,9 @@ group: "{{ prometheus_user }}" mode: '0775' recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" + - name: Create data folder become: true @@ -28,6 +33,8 @@ group: "{{ prometheus_user }}" mode: '0775' recurse: yes + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Download/unarchive Packages for prometheus become: true @@ -38,6 +45,8 @@ group: "{{ prometheus_user }}" remote_src: yes creates: "{{ prometheus_download_dir }}" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Copying the service binary for prometheus become: true @@ -51,6 +60,8 @@ with_items: - prometheus - promtool + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Copying the console binary become: true @@ -64,18 +75,24 @@ with_items: - consoles - console_libraries + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - 
name: Removing the tar file of prometheus become: true file: path: "{{ prometheus_download_dir }}" state: absent + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Create prometheus systemd service file become: true template: src: templates/prometheus.service.j2 dest: "{{ service_dest_dir }}/prometheus.service" + run_once: true + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: Get current nodes in /etc/hosts shell: "cat /etc/hosts | grep .local.vcn | awk '{print $2}'" @@ -84,9 +101,16 @@ run_once: true ignore_errors: yes +- name: Get controller and login nodes + shell: "cat /etc/hosts | grep \"controller\\|login\" | grep -v \"ANSIBLE MANAGED BLOCK\" | awk '{print $3}'" + register: c_l_nodes_in_etc_hosts + delegate_to: 127.0.0.1 + run_once: true + ignore_errors: yes + - name: set fact set_fact: - nodelist: "{{ nodes_in_etc_hosts.stdout_lines }}" + nodelist: "{{ nodes_in_etc_hosts.stdout_lines + c_l_nodes_in_etc_hosts.stdout_lines }}" run_once: true - name: Loop over the list of hosts and gather serial number and cluster name @@ -103,7 +127,7 @@ group: "{{ prometheus_user }}" mode: '0775' run_once: true - delegate_to: 127.0.0.1 + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" - name: restart prometheus become: true @@ -112,4 +136,4 @@ state: restarted daemon_reload: yes enabled: yes - delegate_to: 127.0.0.1 + delegate_to: "{% if groups['monitoring'] | length > 0 %}{{groups['monitoring'][0]}}{% else%}127.0.0.1{% endif %}" diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 51c44d70..3af0233a 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -124,7 +124,7 @@ - name: Get rackIDs for all compute nodes set_fact: racks_to_add_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])| difference(groups['monitoring'])}}" run_once: true register: racks_to_add_temp_results @@ -135,7 +135,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_add_temp_results @@ -162,7 +162,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" 
command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 @@ -172,7 +172,7 @@ - name: Get hostlist if switch does not exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ new_line[:-1] }}" register: rack_hostlist2 delegate_to: 127.0.0.1 diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 94fdb547..3c384390 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -92,7 +92,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 93c20964..fa3a15ab 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -16,7 +16,7 @@ - name: Get hostnames set_fact: nodes_to_remove_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: nodes_to_remove_temp_results @@ -62,7 +62,7 @@ - name: Get rackIDs set_fact: racks_to_remove_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring'])}}" run_once: true register: racks_to_remove_temp_results diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index 5d9d7c5d..ea6bf5e1 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -36,7 +36,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" + with_items: "{{ play_hosts | difference(groups['controller']) | difference(groups['slurm_backup']) | difference(groups['login']) | difference(groups['monitoring']) }}" run_once: true register: nodes_to_add_temp_results diff --git 
a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index d9c15214..8afebd88 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "epel,ol7_developer_EPEL" include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 1f5a2482..693df83e 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -5,11 +5,11 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives vars: diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index 5399d7cc..e3533b75 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -10,9 +10,9 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) -- name: run login server directives +- name: run login/monitoring server directives include_tasks: login.yml - when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + when: (('login' in group_names) or ('monitoring' in group_names) ) and (not destroy|bool) and (initial| bool) - name: run backup server directives include_tasks: backup_server.yml diff --git a/playbooks/site.yml b/playbooks/site.yml index 70bc03b6..a76a6afd 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -11,7 +11,7 @@ when: change_hostname | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -90,7 +90,7 @@ name: fss-home when: add_nfs|bool and home_fss|bool -- hosts: controller, slurm_backup, login +- hosts: controller, slurm_backup, login, monitoring become: true tasks: - include_role: @@ -125,7 +125,7 @@ when: not inst_prin|bool -- hosts: compute, login +- hosts: compute, login, monitoring become: true tasks: - include_role: @@ -254,27 +254,38 @@ name: nccl-conf when: cluster_network|bool -- hosts: all +- hosts: all,!monitoring tasks: - - include_role: - name: prometheus - when: monitoring|default(false)|bool - include_role: name: metrics-exporter - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool + +- hosts: monitoring + tasks: + - include_role: + name: grafana + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length > 0 ) - hosts: controller tasks: - include_role: name: grafana - when: monitoring|default(false)|bool + when: cluster_monitoring|default(false)|bool and ( groups['monitoring'] | length == 0 ) + +- hosts: controller, 
monitoring + tasks: + - include_role: + name: prometheus + when: cluster_monitoring|default(false)|bool + +- hosts: controller + tasks: - include_role: name: autoscaling_mon when: autoscaling_monitoring|default(false)|bool - include_role: name: cron - - hosts: compute become: true vars: diff --git a/playbooks/slurm_config.yml b/playbooks/slurm_config.yml index dce70f01..ce9c15f0 100755 --- a/playbooks/slurm_config.yml +++ b/playbooks/slurm_config.yml @@ -1,4 +1,4 @@ -- hosts: controller,slurm_backup,compute,login +- hosts: controller,slurm_backup,compute,login, monitoring gather_facts: true vars: destroy: false diff --git a/schema.yaml b/schema.yaml index befa2a21..edcfb0d2 100755 --- a/schema.yaml +++ b/schema.yaml @@ -93,6 +93,26 @@ variableGroups: - ${login_block} - ${login_block_volume_size} - ${login_block_volume_performance} + + - title: "Additional Monitoring Node" + variables: + - ${cluster_monitoring} + - ${monitoring_node} + - ${monitoring_ad} + - ${monitoring_shape} + - ${monitoring_ocpus} + - ${monitoring_ocpus_denseIO_flex} + - ${monitoring_custom_memory} + - ${monitoring_memory} + - ${monitoring_boot_volume_size} + - ${use_marketplace_image_monitoring} + - ${marketplace_listing_monitoring} + - ${unsupported_monitoring} + - ${monitoring_image_compartment} + - ${custom_monitoring_image} + - ${unsupported_monitoring_image} + - ${monitoring_username} + - title: Autoscaling variables: - ${autoscaling} @@ -104,7 +124,7 @@ variableGroups: - ${api_user_ocid} - ${api_fingerprint} - ${api_user_key} - - title: "Monitoring" + - title: "Autoscaling Monitoring" variables: - ${autoscaling_mysql_service} - ${monitoring_shape_name} @@ -178,7 +198,6 @@ variableGroups: - ${rack_aware} - ${queue} - ${spack} - - ${monitoring} - ${enroot} - ${pyxis} - ${pam} @@ -1187,6 +1206,12 @@ variables: description: "Install Enroot, Nvidia Container Toolkit, and docker." visible: ${slurm} + cluster_monitoring: + type: boolean + title: "Install HPC Cluster Monitoring Tools" + default: false + description: "Install Grafana, Node-Exporter, and Prometheus tools for system monitoring." + pam: type: boolean title: "Enable PAM" @@ -1207,12 +1232,6 @@ variables: default: false description: "Will run tests on GPU nodes before starting a job. Nodes that are showing issues will be set in drain state" visible: ${slurm} - - monitoring: - type: boolean - title: "Install HPC Cluster Monitoring Tools" - default: false - description: "Install Grafana, Node-Exporter, and Prometheus tools for system monitoring." 
autoscaling: type: boolean @@ -1704,3 +1723,235 @@ and: - ${use_marketplace_image_login} - ${login_node} + + + + monitoring_node: + type: boolean + title: "Monitoring Node" + default: false + description: "Create an additional monitoring node for users" + visible: ${cluster_monitoring} + + monitoring_ad: + type: oci:identity:availabilitydomain:name + dependsOn: + compartmentId: ${targetCompartment} + visible: + and: + - complexExpression + - ${monitoring_node} + required: true + description: "Availability Domain for monitoring node" + title: "Availability Domain for Monitoring Node" + default: ${ad} + + monitoring_shape: + type: oci:core:instanceshape:name + dependsOn: + compartmentId: ${targetCompartment} + required: true + default: VM.Standard.E4.Flex + visible: ${monitoring_node} + + monitoring_ocpus: + type: integer + description: Number of OCPUs for the flex shape + minimum: 1 + maximum: 64 + default: 32 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + required: true + + monitoring_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPUs for the Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 16 + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.DenseIO.E5.Flex" + - ${monitoring_node} + required: true + + monitoring_custom_memory: + title: Use custom memory size + type: boolean + default: false + visible: + and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - ${monitoring_node} + monitoring_memory: + title: Memory in GBs + type: integer + description: Amount of memory for the flex shape. Minimum 1 GB per core. 
+ minimum: 1 + maximum: 1024 + default: 256 + visible: + and: + - and: + - or: + - eq: + - ${monitoring_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Optimized3.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.E5.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${monitoring_shape} + - "VM.Standard3.Flex" + - and: + - ${monitoring_custom_memory} + - ${monitoring_node} + required: true + + monitoring_boot_volume_size: + type: integer + required: true + minimum: 50 + title: "Size of the boot volume in GB" + default: 512 + visible: ${monitoring_node} + + unsupported_monitoring: + title: "Use unsupported image" + description: "Use a custom, unsupported image for the monitoring node" + type: boolean + default: false + visible: + and: + - ${monitoring_node} + - not: + - ${use_marketplace_image_monitoring} + + monitoring_image_compartment: + title: "Monitoring image compartment" + type: oci:identity:compartment:id + default: ${targetCompartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + + custom_monitoring_image: + title: "Monitoring Image ID" + description: "Custom image ID for monitoring nodes. Note that only Oracle Linux and Ubuntu 22.04 are supported as monitoring images at this time." + type: oci:core:image:id + dependsOn: + compartmentId: ${monitoring_image_compartment} + visible: + and: + - ${monitoring_node} + - not: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + required: true + unsupported_monitoring_image: + title: "Image OCID" + description: "Custom image ID for monitoring nodes" + type: string + required: true + visible: + and: + - ${unsupported_monitoring} + - not: + - ${use_marketplace_image_monitoring} + default: "image.ocid" + + monitoring_username: + title: "Default username for monitoring node" + description: "Default username for the monitoring node" + type: string + default: "opc" + required: true + visible: ${monitoring_node} + + use_marketplace_image_monitoring: + type: boolean + title: "Use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: ${monitoring_node} + + marketplace_listing_monitoring: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL7" + - "HPC_OL8" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" + visible: + and: + - ${use_marketplace_image_monitoring} + - ${monitoring_node} diff --git a/slurm_ha.tf b/slurm_ha.tf index df2cf788..89dda946 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -191,6 +191,8 @@ resource "null_resource" "cluster_backup" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? 
zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -232,7 +234,7 @@ resource "null_resource" "cluster_backup" { shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, controller_username = var.controller_username, compute_username = var.compute_username, @@ -341,6 +343,8 @@ resource "null_resource" "cluster_backup" { backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", login_name = var.login_node ? oci_core_instance.login[0].display_name : "", login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", + monitoring_name = var.monitoring_node ? oci_core_instance.monitoring[0].display_name : "", + monitoring_ip = var.monitoring_node ? oci_core_instance.monitoring[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.controller_subnet_id, @@ -387,7 +391,7 @@ resource "null_resource" "cluster_backup" { localdisk = var.localdisk, log_vol = var.log_vol, redundancy = var.redundancy, - monitoring = var.monitoring, + cluster_monitoring = var.cluster_monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, autoscaling_monitoring = var.autoscaling_monitoring, diff --git a/variables.tf b/variables.tf index 4a6ef21e..ccf1fc15 100755 --- a/variables.tf +++ b/variables.tf @@ -24,6 +24,10 @@ variable "custom_login_image" { type = string default = "image.ocid" } +variable "custom_monitoring_image" { + type = string + default = "image.ocid" +} variable "controller_boot_volume_size" {} variable "controller_boot_volume_backup" {} variable "controller_boot_volume_backup_type" {default = "INCREMENTAL"} @@ -40,6 +44,7 @@ variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm variable "use_compute_agent" { default = true } variable "unsupported_controller_image" { default = "" } variable "unsupported_login_image" { default = "" } +variable "unsupported_monitoring_image" { default = "" } variable "use_cluster_nfs" { default = true} variable "use_scratch_nfs" { default = false } variable "cluster_nfs_path" { default = "/nfs/cluster" } @@ -61,6 +66,10 @@ variable "login_node" { default = true } variable "login_ad" {default = ""} variable "login_shape" { default = "VM.Standard2.4" } variable "login_boot_volume_size" {default = 50} +variable "monitoring_node" { default = false } +variable "monitoring_ad" {default = ""} +variable "monitoring_shape" { default = "VM.Standard2.4" } +variable "monitoring_boot_volume_size" {default = 50} variable "slurm_nfs" { default = false } variable "rack_aware" { default = false } variable "ldap" { default = true } @@ -73,10 +82,14 @@ variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } variable "login_ocpus" { default = 2} variable "login_ocpus_denseIO_flex" { default = 8} +variable "monitoring_ocpus" { default = 2} +variable "monitoring_ocpus_denseIO_flex" { default = 8} variable "controller_memory" { default = 16 } variable "controller_custom_memory" { default = false } variable "login_memory" { default = 16 } variable 
"login_custom_memory" { default = false } +variable "monitoring_memory" { default = 16 } +variable "monitoring_custom_memory" { default = false } variable "privilege_sudo" { default = true } variable "privilege_group_name" { default = "privilege" } @@ -134,9 +147,7 @@ variable "login_block_volume_performance" { */ default = "10. Balanced performance" - } - variable "login_block" { default = false } @@ -171,8 +182,8 @@ variable "nfs_source_IP" { default = ""} variable "nfs_list_of_mount_target_IPs" { default = ""} variable "nfs_source_path" { default = "/app"} variable "nfs_options" {default = ""} -variable "monitoring" { default = true } variable "enroot" { default = false } +variable "cluster_monitoring" { default = false } variable "pyxis" { default = false } variable "pam" { default = false } variable "sacct_limits" { default = false } @@ -195,6 +206,10 @@ variable "unsupported_login" { type=bool default = false } +variable "unsupported_monitoring" { + type=bool + default = false +} variable "controller_username" { type = string default = "opc" @@ -208,7 +223,10 @@ variable "login_username" { type = string default = "opc" } - +variable "monitoring_username" { + type = string + default = "opc" +} variable "autoscaling_monitoring" { type= bool default = false @@ -246,12 +264,16 @@ variable "log_vol" { default = false } variable "redundancy" { default = true } variable "use_marketplace_image_login" { default = true} +variable "use_marketplace_image_monitoring" { default = true} variable "marketplace_listing_login" { - default = "HPC_OL7" + default = "HPC_OL8" +} +variable "marketplace_listing_monitoring" { + default = "HPC_OL8" } variable "marketplace_listing_controller" { - default = "HPC_OL7" + default = "HPC_OL8" } variable "zone_name" { default = ""